#include "arm_asm.h"
// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison (at) riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, so there is little to be gained by wrapping it
// up in Perl; it is presented instead as pure assembly.


#include "crypto/arm_arch.h"

.text





.type	_bsaes_decrypt8,%function
.align	4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
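//
// The ushr/eor/and/shl runs below are SWAPMOVE steps: they transpose the
// eight 128-bit blocks into bit-sliced form, so that bit i of every byte
// ends up collected in register i. As a rough C model of one step on a
// 64-bit lane (an illustration only, not the exact SIMD code):
//
//   // Swap the bits of *a (at distance n) with the bits of *b under mask m.
//   static inline void swapmove(uint64_t *a, uint64_t *b,
//                               uint64_t m, unsigned n)
//   {
//       uint64_t t = ((*a >> n) ^ *b) & m;
//       *b ^= t;
//       *a ^= t << n;
//   }
//
// Three passes with masks 0x55, 0x33, 0x0f (replicated across the lane)
// and shifts 1, 2, 4 complete the transpose; the mirror-image sequence
// at the end of the function converts back.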
_bsaes_decrypt8:
	ldr	q8, [x9], #16
	adrp	x11, .LM0ISR
	add	x11, x11, #:lo12:.LM0ISR
	movi	v9.16b, #0x55
	ldr	q10, [x11], #16
	movi	v16.16b, #0x33
	movi	v17.16b, #0x0f
	sub	x10, x10, #1
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v8.16b
	eor	v2.16b, v2.16b, v8.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v3.16b, v3.16b, v8.16b
	eor	v5.16b, v5.16b, v8.16b
	tbl	v0.16b, {v0.16b}, v10.16b
	tbl	v1.16b, {v1.16b}, v10.16b
	tbl	v2.16b, {v2.16b}, v10.16b
	tbl	v4.16b, {v4.16b}, v10.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v7.16b, v7.16b, v8.16b
	tbl	v3.16b, {v3.16b}, v10.16b
	tbl	v5.16b, {v5.16b}, v10.16b
	tbl	v6.16b, {v6.16b}, v10.16b
	ushr	v8.2d, v0.2d, #1
	tbl	v7.16b, {v7.16b}, v10.16b
	ushr	v10.2d, v4.2d, #1
	ushr	v18.2d, v2.2d, #1
	eor	v8.16b, v8.16b, v1.16b
	ushr	v19.2d, v6.2d, #1
	eor	v10.16b, v10.16b, v5.16b
	eor	v18.16b, v18.16b, v3.16b
	and	v8.16b, v8.16b, v9.16b
	eor	v19.16b, v19.16b, v7.16b
	and	v10.16b, v10.16b, v9.16b
	and	v18.16b, v18.16b, v9.16b
	eor	v1.16b, v1.16b, v8.16b
	shl	v8.2d, v8.2d, #1
	and	v9.16b, v19.16b, v9.16b
	eor	v5.16b, v5.16b, v10.16b
	shl	v10.2d, v10.2d, #1
	eor	v3.16b, v3.16b, v18.16b
	shl	v18.2d, v18.2d, #1
	eor	v0.16b, v0.16b, v8.16b
	shl	v8.2d, v9.2d, #1
	eor	v7.16b, v7.16b, v9.16b
	eor	v4.16b, v4.16b, v10.16b
	eor	v2.16b, v2.16b, v18.16b
	ushr	v9.2d, v1.2d, #2
	eor	v6.16b, v6.16b, v8.16b
	ushr	v8.2d, v0.2d, #2
	ushr	v10.2d, v5.2d, #2
	ushr	v18.2d, v4.2d, #2
	eor	v9.16b, v9.16b, v3.16b
	eor	v8.16b, v8.16b, v2.16b
	eor	v10.16b, v10.16b, v7.16b
	eor	v18.16b, v18.16b, v6.16b
	and	v9.16b, v9.16b, v16.16b
	and	v8.16b, v8.16b, v16.16b
	and	v10.16b, v10.16b, v16.16b
	and	v16.16b, v18.16b, v16.16b
	eor	v3.16b, v3.16b, v9.16b
	shl	v9.2d, v9.2d, #2
	eor	v2.16b, v2.16b, v8.16b
	shl	v8.2d, v8.2d, #2
	eor	v7.16b, v7.16b, v10.16b
	shl	v10.2d, v10.2d, #2
	eor	v6.16b, v6.16b, v16.16b
	shl	v16.2d, v16.2d, #2
	eor	v1.16b, v1.16b, v9.16b
	eor	v0.16b, v0.16b, v8.16b
	eor	v5.16b, v5.16b, v10.16b
	eor	v4.16b, v4.16b, v16.16b
	ushr	v8.2d, v3.2d, #4
	ushr	v9.2d, v2.2d, #4
	ushr	v10.2d, v1.2d, #4
	ushr	v16.2d, v0.2d, #4
	eor	v8.16b, v8.16b, v7.16b
	eor	v9.16b, v9.16b, v6.16b
	eor	v10.16b, v10.16b, v5.16b
	eor	v16.16b, v16.16b, v4.16b
	and	v8.16b, v8.16b, v17.16b
	and	v9.16b, v9.16b, v17.16b
	and	v10.16b, v10.16b, v17.16b
	and	v16.16b, v16.16b, v17.16b
	eor	v7.16b, v7.16b, v8.16b
	shl	v8.2d, v8.2d, #4
	eor	v6.16b, v6.16b, v9.16b
	shl	v9.2d, v9.2d, #4
	eor	v5.16b, v5.16b, v10.16b
	shl	v10.2d, v10.2d, #4
	eor	v4.16b, v4.16b, v16.16b
	shl	v16.2d, v16.2d, #4
	eor	v3.16b, v3.16b, v8.16b
	eor	v2.16b, v2.16b, v9.16b
	eor	v1.16b, v1.16b, v10.16b
	eor	v0.16b, v0.16b, v16.16b
	b	.Ldec_sbox
.align	4
.Ldec_loop:
	ld1	{v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
	ldp	q8, q9, [x9], #32
	eor	v0.16b, v16.16b, v0.16b
	ldr	q10, [x9], #16
	eor	v1.16b, v17.16b, v1.16b
	ldr	q16, [x9], #16
	eor	v2.16b, v18.16b, v2.16b
	eor	v3.16b, v19.16b, v3.16b
	eor	v4.16b, v8.16b, v4.16b
	eor	v5.16b, v9.16b, v5.16b
	eor	v6.16b, v10.16b, v6.16b
	eor	v7.16b, v16.16b, v7.16b
	tbl	v0.16b, {v0.16b}, v28.16b
	tbl	v1.16b, {v1.16b}, v28.16b
	tbl	v2.16b, {v2.16b}, v28.16b
	tbl	v3.16b, {v3.16b}, v28.16b
	tbl	v4.16b, {v4.16b}, v28.16b
	tbl	v5.16b, {v5.16b}, v28.16b
	tbl	v6.16b, {v6.16b}, v28.16b
	tbl	v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
	eor	v1.16b, v1.16b, v4.16b
	eor	v3.16b, v3.16b, v4.16b
	subs	x10, x10, #1
	eor	v4.16b, v4.16b, v7.16b
	eor	v2.16b, v2.16b, v7.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v6.16b, v6.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v0.16b, v0.16b, v1.16b
	eor	v7.16b, v7.16b, v6.16b
	eor	v8.16b, v6.16b, v2.16b
	and	v9.16b, v4.16b, v6.16b
	eor	v10.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v0.16b
	eor	v5.16b, v5.16b, v0.16b
	eor	v16.16b, v7.16b, v4.16b
	eor	v17.16b, v4.16b, v0.16b
	and	v18.16b, v0.16b, v2.16b
	eor	v19.16b, v7.16b, v4.16b
	eor	v1.16b, v1.16b, v3.16b
	eor	v20.16b, v3.16b, v0.16b
	eor	v21.16b, v5.16b, v2.16b
	eor	v22.16b, v3.16b, v7.16b
	and	v8.16b, v17.16b, v8.16b
	orr	v17.16b, v3.16b, v5.16b
	eor	v23.16b, v1.16b, v6.16b
	eor	v24.16b, v20.16b, v16.16b
	eor	v25.16b, v1.16b, v5.16b
	orr	v26.16b, v20.16b, v21.16b
	and	v20.16b, v20.16b, v21.16b
	and	v27.16b, v7.16b, v1.16b
	eor	v21.16b, v21.16b, v23.16b
	orr	v28.16b, v16.16b, v23.16b
	orr	v29.16b, v22.16b, v25.16b
	eor	v26.16b, v26.16b, v8.16b
	and	v16.16b, v16.16b, v23.16b
	and	v22.16b, v22.16b, v25.16b
	and	v21.16b, v24.16b, v21.16b
	eor	v8.16b, v28.16b, v8.16b
	eor	v23.16b, v5.16b, v2.16b
	eor	v24.16b, v1.16b, v6.16b
	eor	v16.16b, v16.16b, v22.16b
	eor	v22.16b, v3.16b, v0.16b
	eor	v25.16b, v29.16b, v21.16b
	eor	v21.16b, v26.16b, v21.16b
	eor	v8.16b, v8.16b, v20.16b
	eor	v26.16b, v23.16b, v24.16b
	eor	v16.16b, v16.16b, v20.16b
	eor	v28.16b, v22.16b, v19.16b
	eor	v20.16b, v25.16b, v20.16b
	eor	v9.16b, v21.16b, v9.16b
	eor	v8.16b, v8.16b, v18.16b
	eor	v18.16b, v5.16b, v1.16b
	eor	v21.16b, v16.16b, v17.16b
	eor	v16.16b, v16.16b, v17.16b
	eor	v17.16b, v20.16b, v27.16b
	eor	v20.16b, v3.16b, v7.16b
	eor	v25.16b, v9.16b, v8.16b
	eor	v27.16b, v0.16b, v4.16b
	and	v29.16b, v9.16b, v17.16b
	eor	v30.16b, v8.16b, v29.16b
	eor	v31.16b, v21.16b, v29.16b
	eor	v29.16b, v21.16b, v29.16b
	bsl	v30.16b, v17.16b, v21.16b
	bsl	v31.16b, v9.16b, v8.16b
	bsl	v16.16b, v30.16b, v29.16b
	bsl	v21.16b, v29.16b, v30.16b
	eor	v8.16b, v31.16b, v30.16b
	and	v1.16b, v1.16b, v31.16b
	and	v9.16b, v16.16b, v31.16b
	and	v6.16b, v6.16b, v30.16b
	eor	v16.16b, v17.16b, v21.16b
	and	v4.16b, v4.16b, v30.16b
	eor	v17.16b, v8.16b, v30.16b
	and	v21.16b, v24.16b, v8.16b
	eor	v9.16b, v9.16b, v25.16b
	and	v19.16b, v19.16b, v8.16b
	eor	v24.16b, v30.16b, v16.16b
	eor	v25.16b, v30.16b, v16.16b
	and	v7.16b, v7.16b, v17.16b
	and	v10.16b, v10.16b, v16.16b
	eor	v29.16b, v9.16b, v16.16b
	eor	v30.16b, v31.16b, v9.16b
	and	v0.16b, v24.16b, v0.16b
	and	v9.16b, v18.16b, v9.16b
	and	v2.16b, v25.16b, v2.16b
	eor	v10.16b, v10.16b, v6.16b
	eor	v18.16b, v29.16b, v16.16b
	and	v5.16b, v30.16b, v5.16b
	eor	v24.16b, v8.16b, v29.16b
	and	v25.16b, v26.16b, v29.16b
	and	v26.16b, v28.16b, v29.16b
	eor	v8.16b, v8.16b, v29.16b
	eor	v17.16b, v17.16b, v18.16b
	eor	v5.16b, v1.16b, v5.16b
	and	v23.16b, v24.16b, v23.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v19.16b, v19.16b, v26.16b
	eor	v0.16b, v4.16b, v0.16b
	and	v3.16b, v17.16b, v3.16b
	eor	v1.16b, v9.16b, v1.16b
	eor	v9.16b, v25.16b, v23.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v2.16b, v6.16b, v2.16b
	and	v6.16b, v8.16b, v22.16b
	eor	v3.16b, v7.16b, v3.16b
	and	v8.16b, v20.16b, v18.16b
	eor	v10.16b, v10.16b, v9.16b
	eor	v0.16b, v0.16b, v19.16b
	eor	v9.16b, v1.16b, v9.16b
	eor	v1.16b, v2.16b, v21.16b
	eor	v3.16b, v3.16b, v19.16b
	and	v16.16b, v27.16b, v16.16b
	eor	v17.16b, v26.16b, v6.16b
	eor	v6.16b, v8.16b, v7.16b
	eor	v7.16b, v1.16b, v9.16b
	eor	v1.16b, v5.16b, v3.16b
	eor	v2.16b, v10.16b, v3.16b
	eor	v4.16b, v16.16b, v4.16b
	eor	v8.16b, v6.16b, v17.16b
	eor	v5.16b, v9.16b, v3.16b
	eor	v9.16b, v0.16b, v1.16b
	eor	v6.16b, v7.16b, v1.16b
	eor	v0.16b, v4.16b, v17.16b
	eor	v4.16b, v8.16b, v7.16b
	eor	v7.16b, v9.16b, v2.16b
	eor	v8.16b, v3.16b, v0.16b
	eor	v7.16b, v7.16b, v5.16b
	eor	v3.16b, v4.16b, v7.16b
	eor	v4.16b, v7.16b, v0.16b
	eor	v7.16b, v8.16b, v3.16b
	bcc	.Ldec_done
	ext	v8.16b, v0.16b, v0.16b, #8
	ext	v9.16b, v1.16b, v1.16b, #8
	ldr	q28, [x11]                  // load from .LISR in common case (x10 > 0)
	ext	v10.16b, v6.16b, v6.16b, #8
	ext	v16.16b, v3.16b, v3.16b, #8
	ext	v17.16b, v5.16b, v5.16b, #8
	ext	v18.16b, v4.16b, v4.16b, #8
	eor	v8.16b, v8.16b, v0.16b
	eor	v9.16b, v9.16b, v1.16b
	eor	v10.16b, v10.16b, v6.16b
	eor	v16.16b, v16.16b, v3.16b
	eor	v17.16b, v17.16b, v5.16b
	ext	v19.16b, v2.16b, v2.16b, #8
	ext	v20.16b, v7.16b, v7.16b, #8
	eor	v18.16b, v18.16b, v4.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v8.16b, v2.16b, v10.16b
	eor	v4.16b, v4.16b, v9.16b
	eor	v2.16b, v19.16b, v2.16b
	eor	v9.16b, v20.16b, v7.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v6.16b, v6.16b, v17.16b
	eor	v8.16b, v8.16b, v16.16b
	eor	v7.16b, v7.16b, v18.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v2.16b, v3.16b, v2.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v3.16b, v5.16b, v9.16b
	eor	v5.16b, v8.16b, v17.16b
	eor	v7.16b, v7.16b, v17.16b
	ext	v8.16b, v0.16b, v0.16b, #12
	ext	v9.16b, v6.16b, v6.16b, #12
	ext	v10.16b, v4.16b, v4.16b, #12
	ext	v16.16b, v1.16b, v1.16b, #12
	ext	v17.16b, v5.16b, v5.16b, #12
	ext	v18.16b, v7.16b, v7.16b, #12
	eor	v0.16b, v0.16b, v8.16b
	eor	v6.16b, v6.16b, v9.16b
	eor	v4.16b, v4.16b, v10.16b
	ext	v19.16b, v2.16b, v2.16b, #12
	ext	v20.16b, v3.16b, v3.16b, #12
	eor	v1.16b, v1.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v7.16b, v7.16b, v18.16b
	eor	v2.16b, v2.16b, v19.16b
	eor	v16.16b, v16.16b, v0.16b
	eor	v3.16b, v3.16b, v20.16b
	eor	v17.16b, v17.16b, v4.16b
	eor	v10.16b, v10.16b, v6.16b
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v9.16b, v9.16b, v1.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	eor	v8.16b, v8.16b, v3.16b
	eor	v16.16b, v16.16b, v3.16b
	eor	v18.16b, v18.16b, v5.16b
	eor	v19.16b, v19.16b, v7.16b
	ext	v21.16b, v5.16b, v5.16b, #8
	ext	v5.16b, v7.16b, v7.16b, #8
	eor	v7.16b, v20.16b, v2.16b
	ext	v4.16b, v4.16b, v4.16b, #8
	ext	v20.16b, v3.16b, v3.16b, #8
	eor	v17.16b, v17.16b, v3.16b
	ext	v2.16b, v2.16b, v2.16b, #8
	eor	v3.16b, v10.16b, v3.16b
	ext	v10.16b, v6.16b, v6.16b, #8
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v5.16b, v5.16b, v18.16b
	eor	v3.16b, v3.16b, v4.16b
	eor	v7.16b, v20.16b, v7.16b
	eor	v6.16b, v2.16b, v19.16b
	eor	v4.16b, v21.16b, v17.16b
	eor	v2.16b, v10.16b, v9.16b
	bne	.Ldec_loop
	ldr	q28, [x11, #16]!            // load from .LISRM0 on last round (x10 == 0)
	b	.Ldec_loop
.align	4
.Ldec_done:
	ushr	v8.2d, v0.2d, #1
	movi	v9.16b, #0x55
	ldr	q10, [x9]
	ushr	v16.2d, v2.2d, #1
	movi	v17.16b, #0x33
	ushr	v18.2d, v6.2d, #1
	movi	v19.16b, #0x0f
	eor	v8.16b, v8.16b, v1.16b
	ushr	v20.2d, v3.2d, #1
	eor	v16.16b, v16.16b, v7.16b
	eor	v18.16b, v18.16b, v4.16b
	and	v8.16b, v8.16b, v9.16b
	eor	v20.16b, v20.16b, v5.16b
	and	v16.16b, v16.16b, v9.16b
	and	v18.16b, v18.16b, v9.16b
	shl	v21.2d, v8.2d, #1
	eor	v1.16b, v1.16b, v8.16b
	and	v8.16b, v20.16b, v9.16b
	eor	v7.16b, v7.16b, v16.16b
	shl	v9.2d, v16.2d, #1
	eor	v4.16b, v4.16b, v18.16b
	shl	v16.2d, v18.2d, #1
	eor	v0.16b, v0.16b, v21.16b
	shl	v18.2d, v8.2d, #1
	eor	v5.16b, v5.16b, v8.16b
	eor	v2.16b, v2.16b, v9.16b
	eor	v6.16b, v6.16b, v16.16b
	ushr	v8.2d, v1.2d, #2
	eor	v3.16b, v3.16b, v18.16b
	ushr	v9.2d, v0.2d, #2
	ushr	v16.2d, v7.2d, #2
	ushr	v18.2d, v2.2d, #2
	eor	v8.16b, v8.16b, v4.16b
	eor	v9.16b, v9.16b, v6.16b
	eor	v16.16b, v16.16b, v5.16b
	eor	v18.16b, v18.16b, v3.16b
	and	v8.16b, v8.16b, v17.16b
	and	v9.16b, v9.16b, v17.16b
	and	v16.16b, v16.16b, v17.16b
	and	v17.16b, v18.16b, v17.16b
	eor	v4.16b, v4.16b, v8.16b
	shl	v8.2d, v8.2d, #2
	eor	v6.16b, v6.16b, v9.16b
	shl	v9.2d, v9.2d, #2
	eor	v5.16b, v5.16b, v16.16b
	shl	v16.2d, v16.2d, #2
	eor	v3.16b, v3.16b, v17.16b
	shl	v17.2d, v17.2d, #2
	eor	v1.16b, v1.16b, v8.16b
	eor	v0.16b, v0.16b, v9.16b
	eor	v7.16b, v7.16b, v16.16b
	eor	v2.16b, v2.16b, v17.16b
	ushr	v8.2d, v4.2d, #4
	ushr	v9.2d, v6.2d, #4
	ushr	v16.2d, v1.2d, #4
	ushr	v17.2d, v0.2d, #4
	eor	v8.16b, v8.16b, v5.16b
	eor	v9.16b, v9.16b, v3.16b
	eor	v16.16b, v16.16b, v7.16b
	eor	v17.16b, v17.16b, v2.16b
	and	v8.16b, v8.16b, v19.16b
	and	v9.16b, v9.16b, v19.16b
	and	v16.16b, v16.16b, v19.16b
	and	v17.16b, v17.16b, v19.16b
	eor	v5.16b, v5.16b, v8.16b
	shl	v8.2d, v8.2d, #4
	eor	v3.16b, v3.16b, v9.16b
	shl	v9.2d, v9.2d, #4
	eor	v7.16b, v7.16b, v16.16b
	shl	v16.2d, v16.2d, #4
	eor	v2.16b, v2.16b, v17.16b
	shl	v17.2d, v17.2d, #4
	eor	v4.16b, v4.16b, v8.16b
	eor	v6.16b, v6.16b, v9.16b
	eor	v7.16b, v7.16b, v10.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v0.16b, v0.16b, v17.16b
	eor	v4.16b, v4.16b, v10.16b
	eor	v6.16b, v6.16b, v10.16b
	eor	v3.16b, v3.16b, v10.16b
	eor	v5.16b, v5.16b, v10.16b
	eor	v1.16b, v1.16b, v10.16b
	eor	v0.16b, v0.16b, v10.16b
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8

.section	.rodata
.type	_bsaes_consts,%object
.align	6
_bsaes_consts:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR   used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad	0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR   used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad	0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad	0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad	0x090d01050c000408, 0x03070b0f060a0e02

.align	6
.size	_bsaes_consts,.-_bsaes_consts

.previous
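
// Each 128-bit constant above is a byte-selection pattern for the tbl
// instruction: output byte i of tbl is input byte pattern[i]. A small C
// model of the operation (illustrative only; tbl16 is a made-up name):
//
//   // What "tbl vd.16b, {vn.16b}, vm.16b" computes, per byte.
//   static void tbl16(uint8_t out[16], const uint8_t in[16],
//                     const uint8_t pattern[16])
//   {
//       for (int i = 0; i < 16; i++)
//           out[i] = pattern[i] < 16 ? in[pattern[i]] : 0;
//   }
//
// The .LM0* patterns combine the round-0 (Inv)ShiftRows permutation with
// the block interleaving performed on entry to the encrypt/decrypt cores.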

.type	_bsaes_encrypt8,%function
.align	4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_encrypt8:
	ldr	q8, [x9], #16
	adrp	x11, .LM0SR
	add	x11, x11, #:lo12:.LM0SR
	ldr	q9, [x11], #16
_bsaes_encrypt8_alt:
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v8.16b
	sub	x10, x10, #1
	eor	v2.16b, v2.16b, v8.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v3.16b, v3.16b, v8.16b
	eor	v5.16b, v5.16b, v8.16b
	tbl	v0.16b, {v0.16b}, v9.16b
	tbl	v1.16b, {v1.16b}, v9.16b
	tbl	v2.16b, {v2.16b}, v9.16b
	tbl	v4.16b, {v4.16b}, v9.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v7.16b, v7.16b, v8.16b
	tbl	v3.16b, {v3.16b}, v9.16b
	tbl	v5.16b, {v5.16b}, v9.16b
	tbl	v6.16b, {v6.16b}, v9.16b
	ushr	v8.2d, v0.2d, #1
	movi	v10.16b, #0x55
	tbl	v7.16b, {v7.16b}, v9.16b
	ushr	v9.2d, v4.2d, #1
	movi	v16.16b, #0x33
	ushr	v17.2d, v2.2d, #1
	eor	v8.16b, v8.16b, v1.16b
	movi	v18.16b, #0x0f
	ushr	v19.2d, v6.2d, #1
	eor	v9.16b, v9.16b, v5.16b
	eor	v17.16b, v17.16b, v3.16b
	and	v8.16b, v8.16b, v10.16b
	eor	v19.16b, v19.16b, v7.16b
	and	v9.16b, v9.16b, v10.16b
	and	v17.16b, v17.16b, v10.16b
	eor	v1.16b, v1.16b, v8.16b
	shl	v8.2d, v8.2d, #1
	and	v10.16b, v19.16b, v10.16b
	eor	v5.16b, v5.16b, v9.16b
	shl	v9.2d, v9.2d, #1
	eor	v3.16b, v3.16b, v17.16b
	shl	v17.2d, v17.2d, #1
	eor	v0.16b, v0.16b, v8.16b
	shl	v8.2d, v10.2d, #1
	eor	v7.16b, v7.16b, v10.16b
	eor	v4.16b, v4.16b, v9.16b
	eor	v2.16b, v2.16b, v17.16b
	ushr	v9.2d, v1.2d, #2
	eor	v6.16b, v6.16b, v8.16b
	ushr	v8.2d, v0.2d, #2
	ushr	v10.2d, v5.2d, #2
	ushr	v17.2d, v4.2d, #2
	eor	v9.16b, v9.16b, v3.16b
	eor	v8.16b, v8.16b, v2.16b
	eor	v10.16b, v10.16b, v7.16b
	eor	v17.16b, v17.16b, v6.16b
	and	v9.16b, v9.16b, v16.16b
	and	v8.16b, v8.16b, v16.16b
	and	v10.16b, v10.16b, v16.16b
	and	v16.16b, v17.16b, v16.16b
	eor	v3.16b, v3.16b, v9.16b
	shl	v9.2d, v9.2d, #2
	eor	v2.16b, v2.16b, v8.16b
	shl	v8.2d, v8.2d, #2
	eor	v7.16b, v7.16b, v10.16b
	shl	v10.2d, v10.2d, #2
	eor	v6.16b, v6.16b, v16.16b
	shl	v16.2d, v16.2d, #2
	eor	v1.16b, v1.16b, v9.16b
	eor	v0.16b, v0.16b, v8.16b
	eor	v5.16b, v5.16b, v10.16b
	eor	v4.16b, v4.16b, v16.16b
	ushr	v8.2d, v3.2d, #4
	ushr	v9.2d, v2.2d, #4
	ushr	v10.2d, v1.2d, #4
	ushr	v16.2d, v0.2d, #4
	eor	v8.16b, v8.16b, v7.16b
	eor	v9.16b, v9.16b, v6.16b
	eor	v10.16b, v10.16b, v5.16b
	eor	v16.16b, v16.16b, v4.16b
	and	v8.16b, v8.16b, v18.16b
	and	v9.16b, v9.16b, v18.16b
	and	v10.16b, v10.16b, v18.16b
	and	v16.16b, v16.16b, v18.16b
	eor	v7.16b, v7.16b, v8.16b
	shl	v8.2d, v8.2d, #4
	eor	v6.16b, v6.16b, v9.16b
	shl	v9.2d, v9.2d, #4
	eor	v5.16b, v5.16b, v10.16b
	shl	v10.2d, v10.2d, #4
	eor	v4.16b, v4.16b, v16.16b
	shl	v16.2d, v16.2d, #4
	eor	v3.16b, v3.16b, v8.16b
	eor	v2.16b, v2.16b, v9.16b
	eor	v1.16b, v1.16b, v10.16b
	eor	v0.16b, v0.16b, v16.16b
	b	.Lenc_sbox
.align	4
.Lenc_loop:
	ld1	{v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
	ldp	q8, q9, [x9], #32
	eor	v0.16b, v16.16b, v0.16b
	ldr	q10, [x9], #16
	eor	v1.16b, v17.16b, v1.16b
	ldr	q16, [x9], #16
	eor	v2.16b, v18.16b, v2.16b
	eor	v3.16b, v19.16b, v3.16b
	eor	v4.16b, v8.16b, v4.16b
	eor	v5.16b, v9.16b, v5.16b
	eor	v6.16b, v10.16b, v6.16b
	eor	v7.16b, v16.16b, v7.16b
	tbl	v0.16b, {v0.16b}, v28.16b
	tbl	v1.16b, {v1.16b}, v28.16b
	tbl	v2.16b, {v2.16b}, v28.16b
	tbl	v3.16b, {v3.16b}, v28.16b
	tbl	v4.16b, {v4.16b}, v28.16b
	tbl	v5.16b, {v5.16b}, v28.16b
	tbl	v6.16b, {v6.16b}, v28.16b
	tbl	v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
	eor	v5.16b, v5.16b, v6.16b
	eor	v3.16b, v3.16b, v0.16b
	subs	x10, x10, #1
	eor	v2.16b, v2.16b, v1.16b
	eor	v5.16b, v5.16b, v0.16b
	eor	v8.16b, v3.16b, v7.16b
	eor	v6.16b, v6.16b, v2.16b
	eor	v7.16b, v7.16b, v5.16b
	eor	v8.16b, v8.16b, v4.16b
	eor	v3.16b, v6.16b, v3.16b
	eor	v4.16b, v4.16b, v5.16b
	eor	v6.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v7.16b
	eor	v1.16b, v8.16b, v1.16b
	eor	v8.16b, v7.16b, v4.16b
	eor	v9.16b, v3.16b, v0.16b
	eor	v10.16b, v7.16b, v6.16b
	eor	v16.16b, v5.16b, v3.16b
	eor	v17.16b, v6.16b, v2.16b
	eor	v18.16b, v5.16b, v1.16b
	eor	v19.16b, v2.16b, v4.16b
	eor	v20.16b, v1.16b, v0.16b
	orr	v21.16b, v8.16b, v9.16b
	orr	v22.16b, v10.16b, v16.16b
	eor	v23.16b, v8.16b, v17.16b
	eor	v24.16b, v9.16b, v18.16b
	and	v19.16b, v19.16b, v20.16b
	orr	v20.16b, v17.16b, v18.16b
	and	v8.16b, v8.16b, v9.16b
	and	v9.16b, v17.16b, v18.16b
	and	v17.16b, v23.16b, v24.16b
	and	v10.16b, v10.16b, v16.16b
	eor	v16.16b, v21.16b, v19.16b
	eor	v18.16b, v20.16b, v19.16b
	and	v19.16b, v2.16b, v1.16b
	and	v20.16b, v6.16b, v5.16b
	eor	v21.16b, v22.16b, v17.16b
	eor	v9.16b, v9.16b, v10.16b
	eor	v10.16b, v16.16b, v17.16b
	eor	v16.16b, v18.16b, v8.16b
	and	v17.16b, v4.16b, v0.16b
	orr	v18.16b, v7.16b, v3.16b
	eor	v21.16b, v21.16b, v8.16b
	eor	v8.16b, v9.16b, v8.16b
	eor	v9.16b, v10.16b, v19.16b
	eor	v10.16b, v3.16b, v0.16b
	eor	v16.16b, v16.16b, v17.16b
	eor	v17.16b, v5.16b, v1.16b
	eor	v19.16b, v21.16b, v20.16b
	eor	v20.16b, v8.16b, v18.16b
	eor	v8.16b, v8.16b, v18.16b
	eor	v18.16b, v7.16b, v4.16b
	eor	v21.16b, v9.16b, v16.16b
	eor	v22.16b, v6.16b, v2.16b
	and	v23.16b, v9.16b, v19.16b
	eor	v24.16b, v10.16b, v17.16b
	eor	v25.16b, v0.16b, v1.16b
	eor	v26.16b, v7.16b, v6.16b
	eor	v27.16b, v18.16b, v22.16b
	eor	v28.16b, v3.16b, v5.16b
	eor	v29.16b, v16.16b, v23.16b
	eor	v30.16b, v20.16b, v23.16b
	eor	v23.16b, v20.16b, v23.16b
	eor	v31.16b, v4.16b, v2.16b
	bsl	v29.16b, v19.16b, v20.16b
	bsl	v30.16b, v9.16b, v16.16b
	bsl	v8.16b, v29.16b, v23.16b
	bsl	v20.16b, v23.16b, v29.16b
	eor	v9.16b, v30.16b, v29.16b
	and	v5.16b, v5.16b, v30.16b
	and	v8.16b, v8.16b, v30.16b
	and	v1.16b, v1.16b, v29.16b
	eor	v16.16b, v19.16b, v20.16b
	and	v2.16b, v2.16b, v29.16b
	eor	v19.16b, v9.16b, v29.16b
	and	v17.16b, v17.16b, v9.16b
	eor	v8.16b, v8.16b, v21.16b
	and	v20.16b, v22.16b, v9.16b
	eor	v21.16b, v29.16b, v16.16b
	eor	v22.16b, v29.16b, v16.16b
	and	v23.16b, v25.16b, v16.16b
	and	v6.16b, v6.16b, v19.16b
	eor	v25.16b, v8.16b, v16.16b
	eor	v29.16b, v30.16b, v8.16b
	and	v4.16b, v21.16b, v4.16b
	and	v8.16b, v28.16b, v8.16b
	and	v0.16b, v22.16b, v0.16b
	eor	v21.16b, v23.16b, v1.16b
	eor	v22.16b, v9.16b, v25.16b
	eor	v9.16b, v9.16b, v25.16b
	eor	v23.16b, v25.16b, v16.16b
	and	v3.16b, v29.16b, v3.16b
	and	v24.16b, v24.16b, v25.16b
	and	v25.16b, v27.16b, v25.16b
	and	v10.16b, v22.16b, v10.16b
	and	v9.16b, v9.16b, v18.16b
	eor	v18.16b, v19.16b, v23.16b
	and	v19.16b, v26.16b, v23.16b
	eor	v3.16b, v5.16b, v3.16b
	eor	v17.16b, v17.16b, v24.16b
	eor	v10.16b, v24.16b, v10.16b
	and	v16.16b, v31.16b, v16.16b
	eor	v20.16b, v20.16b, v25.16b
	eor	v9.16b, v25.16b, v9.16b
	eor	v4.16b, v2.16b, v4.16b
	and	v7.16b, v18.16b, v7.16b
	eor	v18.16b, v19.16b, v6.16b
	eor	v5.16b, v8.16b, v5.16b
	eor	v0.16b, v1.16b, v0.16b
	eor	v1.16b, v21.16b, v10.16b
	eor	v8.16b, v3.16b, v17.16b
	eor	v2.16b, v16.16b, v2.16b
	eor	v3.16b, v6.16b, v7.16b
	eor	v6.16b, v18.16b, v9.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v10.16b, v5.16b, v10.16b
	eor	v0.16b, v0.16b, v17.16b
	eor	v9.16b, v2.16b, v9.16b
	eor	v3.16b, v3.16b, v20.16b
	eor	v7.16b, v6.16b, v1.16b
	eor	v5.16b, v8.16b, v4.16b
	eor	v6.16b, v10.16b, v1.16b
	eor	v2.16b, v4.16b, v0.16b
	eor	v4.16b, v3.16b, v10.16b
	eor	v9.16b, v9.16b, v7.16b
	eor	v3.16b, v0.16b, v5.16b
	eor	v0.16b, v1.16b, v4.16b
	eor	v1.16b, v4.16b, v8.16b
	eor	v4.16b, v9.16b, v5.16b
	eor	v6.16b, v6.16b, v3.16b
	bcc	.Lenc_done
	ext	v8.16b, v0.16b, v0.16b, #12
	ext	v9.16b, v4.16b, v4.16b, #12
	ldr	q28, [x11]                  // load from .LSR in common case (x10 > 0)
	ext	v10.16b, v6.16b, v6.16b, #12
	ext	v16.16b, v1.16b, v1.16b, #12
	ext	v17.16b, v3.16b, v3.16b, #12
	ext	v18.16b, v7.16b, v7.16b, #12
	eor	v0.16b, v0.16b, v8.16b
	eor	v4.16b, v4.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
	ext	v19.16b, v2.16b, v2.16b, #12
	ext	v20.16b, v5.16b, v5.16b, #12
	eor	v1.16b, v1.16b, v16.16b
	eor	v3.16b, v3.16b, v17.16b
	eor	v7.16b, v7.16b, v18.16b
	eor	v2.16b, v2.16b, v19.16b
	eor	v16.16b, v16.16b, v0.16b
	eor	v5.16b, v5.16b, v20.16b
	eor	v17.16b, v17.16b, v6.16b
	eor	v10.16b, v10.16b, v4.16b
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v9.16b, v9.16b, v1.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	eor	v8.16b, v8.16b, v5.16b
	eor	v16.16b, v16.16b, v5.16b
	eor	v18.16b, v18.16b, v3.16b
	eor	v19.16b, v19.16b, v7.16b
	ext	v3.16b, v3.16b, v3.16b, #8
	ext	v7.16b, v7.16b, v7.16b, #8
	eor	v20.16b, v20.16b, v2.16b
	ext	v6.16b, v6.16b, v6.16b, #8
	ext	v21.16b, v5.16b, v5.16b, #8
	eor	v17.16b, v17.16b, v5.16b
	ext	v2.16b, v2.16b, v2.16b, #8
	eor	v10.16b, v10.16b, v5.16b
	ext	v22.16b, v4.16b, v4.16b, #8
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v5.16b, v7.16b, v18.16b
	eor	v4.16b, v3.16b, v17.16b
	eor	v3.16b, v6.16b, v10.16b
	eor	v7.16b, v21.16b, v20.16b
	eor	v6.16b, v2.16b, v19.16b
	eor	v2.16b, v22.16b, v9.16b
	bne	.Lenc_loop
	ldr	q28, [x11, #16]!            // load from .LSRM0 on last round (x10 == 0)
	b	.Lenc_loop
.align	4
.Lenc_done:
	ushr	v8.2d, v0.2d, #1
	movi	v9.16b, #0x55
	ldr	q10, [x9]
	ushr	v16.2d, v3.2d, #1
	movi	v17.16b, #0x33
	ushr	v18.2d, v4.2d, #1
	movi	v19.16b, #0x0f
	eor	v8.16b, v8.16b, v1.16b
	ushr	v20.2d, v2.2d, #1
	eor	v16.16b, v16.16b, v7.16b
	eor	v18.16b, v18.16b, v6.16b
	and	v8.16b, v8.16b, v9.16b
	eor	v20.16b, v20.16b, v5.16b
	and	v16.16b, v16.16b, v9.16b
	and	v18.16b, v18.16b, v9.16b
	shl	v21.2d, v8.2d, #1
	eor	v1.16b, v1.16b, v8.16b
	and	v8.16b, v20.16b, v9.16b
	eor	v7.16b, v7.16b, v16.16b
	shl	v9.2d, v16.2d, #1
	eor	v6.16b, v6.16b, v18.16b
	shl	v16.2d, v18.2d, #1
	eor	v0.16b, v0.16b, v21.16b
	shl	v18.2d, v8.2d, #1
	eor	v5.16b, v5.16b, v8.16b
	eor	v3.16b, v3.16b, v9.16b
	eor	v4.16b, v4.16b, v16.16b
	ushr	v8.2d, v1.2d, #2
	eor	v2.16b, v2.16b, v18.16b
	ushr	v9.2d, v0.2d, #2
	ushr	v16.2d, v7.2d, #2
	ushr	v18.2d, v3.2d, #2
	eor	v8.16b, v8.16b, v6.16b
	eor	v9.16b, v9.16b, v4.16b
	eor	v16.16b, v16.16b, v5.16b
	eor	v18.16b, v18.16b, v2.16b
	and	v8.16b, v8.16b, v17.16b
	and	v9.16b, v9.16b, v17.16b
	and	v16.16b, v16.16b, v17.16b
	and	v17.16b, v18.16b, v17.16b
	eor	v6.16b, v6.16b, v8.16b
	shl	v8.2d, v8.2d, #2
	eor	v4.16b, v4.16b, v9.16b
	shl	v9.2d, v9.2d, #2
	eor	v5.16b, v5.16b, v16.16b
	shl	v16.2d, v16.2d, #2
	eor	v2.16b, v2.16b, v17.16b
	shl	v17.2d, v17.2d, #2
	eor	v1.16b, v1.16b, v8.16b
	eor	v0.16b, v0.16b, v9.16b
	eor	v7.16b, v7.16b, v16.16b
	eor	v3.16b, v3.16b, v17.16b
	ushr	v8.2d, v6.2d, #4
	ushr	v9.2d, v4.2d, #4
	ushr	v16.2d, v1.2d, #4
	ushr	v17.2d, v0.2d, #4
	eor	v8.16b, v8.16b, v5.16b
	eor	v9.16b, v9.16b, v2.16b
	eor	v16.16b, v16.16b, v7.16b
	eor	v17.16b, v17.16b, v3.16b
	and	v8.16b, v8.16b, v19.16b
	and	v9.16b, v9.16b, v19.16b
	and	v16.16b, v16.16b, v19.16b
	and	v17.16b, v17.16b, v19.16b
	eor	v5.16b, v5.16b, v8.16b
	shl	v8.2d, v8.2d, #4
	eor	v2.16b, v2.16b, v9.16b
	shl	v9.2d, v9.2d, #4
	eor	v7.16b, v7.16b, v16.16b
	shl	v16.2d, v16.2d, #4
	eor	v3.16b, v3.16b, v17.16b
	shl	v17.2d, v17.2d, #4
	eor	v6.16b, v6.16b, v8.16b
	eor	v4.16b, v4.16b, v9.16b
	eor	v7.16b, v7.16b, v10.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v3.16b, v3.16b, v10.16b
	eor	v0.16b, v0.16b, v17.16b
	eor	v6.16b, v6.16b, v10.16b
	eor	v4.16b, v4.16b, v10.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v5.16b, v5.16b, v10.16b
	eor	v1.16b, v1.16b, v10.16b
	eor	v0.16b, v0.16b, v10.16b
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_key_convert,%function
.align	4
// On entry:
//   x9 -> input key (big-endian)
//   x10 = number of rounds
//   x17 -> output key (native endianness)
// On exit:
//   x9, x10 corrupted
//   x11 -> .LM0_bigendian
//   x17 -> last quadword of output key
//   other general-purpose registers preserved
//   v2-v6 preserved
//   v7.16b[] = 0x63
//   v8-v14 preserved
//   v15 = last round key (converted to native endianness)
//   other SIMD registers corrupted
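//
// Each byte of the expanded key is split into eight comparison masks,
// one SIMD register per bit position, using cmtst against the single-bit
// constants set up below. Per byte, cmtst behaves like this C sketch
// (illustrative model only):
//
//   // All-ones where the tested bit is set, all-zeros otherwise.
//   static uint8_t cmtst_byte(uint8_t key_byte, uint8_t bit)
//   {
//       return (key_byte & bit) ? 0xff : 0x00;
//   }
//
// The 0x63 XOR (the .L63 constant) folds the S-box affine constant into
// the round keys, so the bit-sliced S-box in the encrypt/decrypt cores
// can omit it.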
_bsaes_key_convert:
#ifdef __AARCH64EL__
	adrp	x11, .LM0_littleendian
	add	x11, x11, #:lo12:.LM0_littleendian
#else
	adrp	x11, .LM0_bigendian
	add	x11, x11, #:lo12:.LM0_bigendian
#endif
	ldr	q0, [x9], #16               // load round 0 key
	ldr	q1, [x11]                   // .LM0
	ldr	q15, [x9], #16              // load round 1 key

	movi	v7.16b, #0x63               // compose .L63
	movi	v16.16b, #0x01              // bit masks
	movi	v17.16b, #0x02
	movi	v18.16b, #0x04
	movi	v19.16b, #0x08
	movi	v20.16b, #0x10
	movi	v21.16b, #0x20
	movi	v22.16b, #0x40
	movi	v23.16b, #0x80

#ifdef __AARCH64EL__
	rev32	v0.16b, v0.16b
#endif
	sub	x10, x10, #1
	str	q0, [x17], #16              // save round 0 key

.align	4
.Lkey_loop:
	tbl	v0.16b, {v15.16b}, v1.16b
	ldr	q15, [x9], #16              // load next round key

	eor	v0.16b, v0.16b, v7.16b
	cmtst	v24.16b, v0.16b, v16.16b
	cmtst	v25.16b, v0.16b, v17.16b
	cmtst	v26.16b, v0.16b, v18.16b
	cmtst	v27.16b, v0.16b, v19.16b
	cmtst	v28.16b, v0.16b, v20.16b
	cmtst	v29.16b, v0.16b, v21.16b
	cmtst	v30.16b, v0.16b, v22.16b
	cmtst	v31.16b, v0.16b, v23.16b
	sub	x10, x10, #1
	st1	{v24.16b,v25.16b,v26.16b,v27.16b}, [x17], #64 // write bit-sliced round key
	st1	{v28.16b,v29.16b,v30.16b,v31.16b}, [x17], #64
	cbnz	x10, .Lkey_loop

        // don't save last round key
#ifdef __AARCH64EL__
	rev32	v15.16b, v15.16b
	adrp	x11, .LM0_bigendian
	add	x11, x11, #:lo12:.LM0_bigendian
#endif
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert

.globl	ossl_bsaes_cbc_encrypt
.type	ossl_bsaes_cbc_encrypt,%function
.align	4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
//   x3 -> key
//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
//   w5 must be == 0
// On exit:
//   Output plaintext filled in
//   Initialisation vector overwritten with last quadword of ciphertext
//   No output registers, usual AAPCS64 register preservation
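//
// Despite the name, with w5 == 0 this routine performs CBC decryption.
// A scalar reference model of the whole computation (hedged sketch;
// xor16 is a hypothetical helper, AES_decrypt as in OpenSSL):
//
//   // CBC decrypt: P[i] = D(C[i]) ^ C[i-1], with C[-1] = IV.
//   for (size_t i = 0; i < nblocks; i++) {
//       AES_decrypt(ct + 16 * i, pt + 16 * i, key);
//       xor16(pt + 16 * i, i ? ct + 16 * (i - 1) : iv);
//   }
//
// The code below does the same eight blocks at a time through
// _bsaes_decrypt8, reloading the ciphertext to supply the XOR masks.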
ossl_bsaes_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	cmp	x2, #128
	bhs	.Lcbc_do_bsaes
	b	AES_cbc_encrypt
.Lcbc_do_bsaes:

        // it is up to the caller to make sure we are called with enc == 0

	stp	x29, x30, [sp, #-48]!
	stp	d8, d9, [sp, #16]
	stp	d10, d15, [sp, #32]
	lsr	x2, x2, #4                  // len in 16-byte blocks

	ldr	w15, [x3, #240]             // get # of rounds
	mov	x14, sp

        // allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
	mov	x9, x3                      // pass key
	mov	x10, x15                    // pass # of rounds
	mov	sp, x17                     // sp now points at the key schedule
	bl	_bsaes_key_convert
	ldr	q6,  [sp]
	str	q15, [x17]                  // save last round key
	eor	v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
	str	q6, [sp]

	ldr	q15, [x4]                   // load IV
	b	.Lcbc_dec_loop

.align	4
.Lcbc_dec_loop:
	subs	x2, x2, #0x8
	bmi	.Lcbc_dec_loop_finish

	ldr	q0, [x0], #16               // load input
	mov	x9, sp                      // pass the key
	ldr	q1, [x0], #16
	mov	x10, x15
	ldr	q2, [x0], #16
	ldr	q3, [x0], #16
	ldr	q4, [x0], #16
	ldr	q5, [x0], #16
	ldr	q6, [x0], #16
	ldr	q7, [x0], #-7*16

	bl	_bsaes_decrypt8

	ldr	q16, [x0], #16              // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	eor	v1.16b, v1.16b, v16.16b
	str	q0, [x1], #16               // write output
	ldr	q0, [x0], #16
	str	q1, [x1], #16
	ldr	q1, [x0], #16
	eor	v1.16b, v4.16b, v1.16b
	ldr	q4, [x0], #16
	eor	v2.16b, v2.16b, v4.16b
	eor	v0.16b, v6.16b, v0.16b
	ldr	q4, [x0], #16
	str	q0, [x1], #16
	str	q1, [x1], #16
	eor	v0.16b, v7.16b, v4.16b
	ldr	q1, [x0], #16
	str	q2, [x1], #16
	ldr	q2, [x0], #16
	ldr	q15, [x0], #16
	str	q0, [x1], #16
	eor	v0.16b, v5.16b, v2.16b
	eor	v1.16b, v3.16b, v1.16b
	str	q1, [x1], #16
	str	q0, [x1], #16

	b	.Lcbc_dec_loop

.Lcbc_dec_loop_finish:
	adds	x2, x2, #8
	beq	.Lcbc_dec_done

	ldr	q0, [x0], #16               // load input
	cmp	x2, #2
	blo	.Lcbc_dec_one
	ldr	q1, [x0], #16
	mov	x9, sp                      // pass the key
	mov	x10, x15
	beq	.Lcbc_dec_two
	ldr	q2, [x0], #16
	cmp	x2, #4
	blo	.Lcbc_dec_three
	ldr	q3, [x0], #16
	beq	.Lcbc_dec_four
	ldr	q4, [x0], #16
	cmp	x2, #6
	blo	.Lcbc_dec_five
	ldr	q5, [x0], #16
	beq	.Lcbc_dec_six
	ldr	q6, [x0], #-6*16

	bl	_bsaes_decrypt8

	ldr	q5, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q8, [x0], #16
	ldr	q9, [x0], #16
	ldr	q10, [x0], #16
	str	q0, [x1], #16               // write output
	ldr	q0, [x0], #16
	eor	v1.16b, v1.16b, v5.16b
	ldr	q5, [x0], #16
	eor	v6.16b, v6.16b, v8.16b
	ldr	q15, [x0]
	eor	v4.16b, v4.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	str	q1, [x1], #16
	eor	v0.16b, v7.16b, v0.16b
	str	q6, [x1], #16
	eor	v1.16b, v3.16b, v5.16b
	str	q4, [x1], #16
	str	q2, [x1], #16
	str	q0, [x1], #16
	str	q1, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_six:
	sub	x0, x0, #0x60
	bl	_bsaes_decrypt8
	ldr	q3, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q5, [x0], #16
	ldr	q8, [x0], #16
	ldr	q9, [x0], #16
	str	q0, [x1], #16               // write output
	ldr	q0, [x0], #16
	eor	v1.16b, v1.16b, v3.16b
	ldr	q15, [x0]
	eor	v3.16b, v6.16b, v5.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v2.16b, v2.16b, v9.16b
	str	q1, [x1], #16
	eor	v0.16b, v7.16b, v0.16b
	str	q3, [x1], #16
	str	q4, [x1], #16
	str	q2, [x1], #16
	str	q0, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_five:
	sub	x0, x0, #0x50
	bl	_bsaes_decrypt8
	ldr	q3, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q5, [x0], #16
	ldr	q7, [x0], #16
	ldr	q8, [x0], #16
	str	q0, [x1], #16               // write output
	ldr	q15, [x0]
	eor	v0.16b, v1.16b, v3.16b
	eor	v1.16b, v6.16b, v5.16b
	eor	v3.16b, v4.16b, v7.16b
	str	q0, [x1], #16
	eor	v0.16b, v2.16b, v8.16b
	str	q1, [x1], #16
	str	q3, [x1], #16
	str	q0, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_four:
	sub	x0, x0, #0x40
	bl	_bsaes_decrypt8
	ldr	q2, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q3, [x0], #16
	ldr	q5, [x0], #16
	str	q0, [x1], #16               // write output
	ldr	q15, [x0]
	eor	v0.16b, v1.16b, v2.16b
	eor	v1.16b, v6.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	str	q0, [x1], #16
	str	q1, [x1], #16
	str	q2, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_three:
	sub	x0, x0, #0x30
	bl	_bsaes_decrypt8
	ldr	q2, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q3, [x0], #16
	ldr	q15, [x0]
	str	q0, [x1], #16               // write output
	eor	v0.16b, v1.16b, v2.16b
	eor	v1.16b, v6.16b, v3.16b
	str	q0, [x1], #16
	str	q1, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_two:
	sub	x0, x0, #0x20
	bl	_bsaes_decrypt8
	ldr	q2, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q15, [x0]
	str	q0, [x1], #16               // write output
	eor	v0.16b, v1.16b, v2.16b
	str	q0, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_one:
	sub	x0, x0, #0x10
	stp	x1, x4, [sp, #-32]!
	str	x14, [sp, #16]
	mov	v8.16b, v15.16b
	mov	v15.16b, v0.16b
	mov	x2, x3
	bl	AES_decrypt
	ldr	x14, [sp, #16]
	ldp	x1, x4, [sp], #32
	ldr	q0, [x1]                    // load result
	eor	v0.16b, v0.16b, v8.16b      // ^= IV
	str	q0, [x1]                    // write output

.align	4
.Lcbc_dec_done:
	movi	v0.16b, #0
	movi	v1.16b, #0
.Lcbc_dec_bzero:	//	wipe key schedule [if any]
	stp	q0, q1, [sp], #32
	cmp	sp, x14
	bne	.Lcbc_dec_bzero
	str	q15, [x4]                   // return IV
	ldp	d8, d9, [sp, #16]
	ldp	d10, d15, [sp, #32]
	ldp	x29, x30, [sp], #48
	ret
.size	ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt

.globl	ossl_bsaes_ctr32_encrypt_blocks
.type	ossl_bsaes_ctr32_encrypt_blocks,%function
.align	4
// On entry:
//   x0 -> input text (whole 16-byte blocks)
//   x1 -> output text (whole 16-byte blocks)
//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
//   x3 -> key
//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
//   Output text filled in
//   No output registers, usual AAPCS64 register preservation
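//
// Only the low 32 bits of the big-endian counter are incremented (hence
// "ctr32"); the upper 96 bits never change. As a hedged scalar model of
// the per-block counter update:
//
//   // Bump the last four bytes of the 16-byte counter as a big-endian
//   // 32-bit integer; wrap-around modulo 2^32 is intentional.
//   static void ctr32_inc(uint8_t ctr[16])
//   {
//       for (int i = 15; i >= 12; i--)
//           if (++ctr[i] != 0)
//               break;
//   }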
ossl_bsaes_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	cmp	x2, #8                      // use plain AES for
	blo	.Lctr_enc_short             // small sizes

	stp	x29, x30, [sp, #-80]!
	stp	d8, d9, [sp, #16]
	stp	d10, d11, [sp, #32]
	stp	d12, d13, [sp, #48]
	stp	d14, d15, [sp, #64]

	ldr	w15, [x3, #240]             // get # of rounds
	mov	x14, sp

        // allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
	mov	x9, x3                      // pass key
	mov	x10, x15                    // pass # of rounds
	mov	sp, x17                     // sp now points at the key schedule
	bl	_bsaes_key_convert
	eor	v7.16b, v7.16b, v15.16b     // fix up last round key
	str	q7, [x17]                   // save last round key

	ldr	q0, [x4]                    // load counter
	add	x13, x11, #.LREVM0SR-.LM0_bigendian
	ldr	q4, [sp]                    // load round0 key

	movi	v8.4s, #1                   // compose 1<<96
	movi	v9.16b, #0
	rev32	v15.16b, v0.16b
	rev32	v0.16b, v0.16b
	ext	v11.16b, v9.16b, v8.16b, #4
	rev32	v4.16b, v4.16b
	add	v12.4s, v11.4s, v11.4s      // compose 2<<96
	str	q4, [sp]                    // save adjusted round0 key
	add	v13.4s, v11.4s, v12.4s      // compose 3<<96
	add	v14.4s, v12.4s, v12.4s      // compose 4<<96
	b	.Lctr_enc_loop

.align	4
.Lctr_enc_loop:
        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
        // to flip byte order in 32-bit counter

	add	v1.4s, v15.4s, v11.4s       // +1
	add	x9, sp, #0x10               // pass next round key
	add	v2.4s, v15.4s, v12.4s       // +2
	ldr	q9, [x13]                   // .LREVM0SR
	ldr	q8, [sp]                    // load round0 key
	add	v3.4s, v15.4s, v13.4s       // +3
	mov	x10, x15                    // pass rounds
	sub	x11, x13, #.LREVM0SR-.LSR   // pass constants
	add	v6.4s, v2.4s, v14.4s
	add	v4.4s, v15.4s, v14.4s       // +4
	add	v7.4s, v3.4s, v14.4s
	add	v15.4s, v4.4s, v14.4s       // next counter
	add	v5.4s, v1.4s, v14.4s

	bl	_bsaes_encrypt8_alt

	subs	x2, x2, #8
	blo	.Lctr_enc_loop_done

	ldr	q16, [x0], #16
	ldr	q17, [x0], #16
	eor	v1.16b, v1.16b, v17.16b
	ldr	q17, [x0], #16
	eor	v0.16b, v0.16b, v16.16b
	eor	v4.16b, v4.16b, v17.16b
	str	q0, [x1], #16
	ldr	q16, [x0], #16
	str	q1, [x1], #16
	mov	v0.16b, v15.16b
	str	q4, [x1], #16
	ldr	q1, [x0], #16
	eor	v4.16b, v6.16b, v16.16b
	eor	v1.16b, v3.16b, v1.16b
	ldr	q3, [x0], #16
	eor	v3.16b, v7.16b, v3.16b
	ldr	q6, [x0], #16
	eor	v2.16b, v2.16b, v6.16b
	ldr	q6, [x0], #16
	eor	v5.16b, v5.16b, v6.16b
	str	q4, [x1], #16
	str	q1, [x1], #16
	str	q3, [x1], #16
	str	q2, [x1], #16
	str	q5, [x1], #16

	bne	.Lctr_enc_loop
	b	.Lctr_enc_done

.align	4
.Lctr_enc_loop_done:
	add	x2, x2, #8
	ldr	q16, [x0], #16              // load input
	eor	v0.16b, v0.16b, v16.16b
	str	q0, [x1], #16               // write output
	cmp	x2, #2
	blo	.Lctr_enc_done
	ldr	q17, [x0], #16
	eor	v1.16b, v1.16b, v17.16b
	str	q1, [x1], #16
	beq	.Lctr_enc_done
	ldr	q18, [x0], #16
	eor	v4.16b, v4.16b, v18.16b
	str	q4, [x1], #16
	cmp	x2, #4
	blo	.Lctr_enc_done
	ldr	q19, [x0], #16
	eor	v6.16b, v6.16b, v19.16b
	str	q6, [x1], #16
	beq	.Lctr_enc_done
	ldr	q20, [x0], #16
	eor	v3.16b, v3.16b, v20.16b
	str	q3, [x1], #16
	cmp	x2, #6
	blo	.Lctr_enc_done
	ldr	q21, [x0], #16
	eor	v7.16b, v7.16b, v21.16b
	str	q7, [x1], #16
	beq	.Lctr_enc_done
	ldr	q22, [x0]
	eor	v2.16b, v2.16b, v22.16b
	str	q2, [x1], #16

.Lctr_enc_done:
	movi	v0.16b, #0
	movi	v1.16b, #0
.Lctr_enc_bzero:	//	wipe key schedule [if any]
	stp	q0, q1, [sp], #32
	cmp	sp, x14
	bne	.Lctr_enc_bzero

	ldp	d8, d9, [sp, #16]
	ldp	d10, d11, [sp, #32]
	ldp	d12, d13, [sp, #48]
	ldp	d14, d15, [sp, #64]
	ldp	x29, x30, [sp], #80
	ret

.Lctr_enc_short:
	stp	x29, x30, [sp, #-96]!
	stp	x19, x20, [sp, #16]
	stp	x21, x22, [sp, #32]
	str	x23, [sp, #48]

	mov	x19, x0                     // copy arguments
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	ldr	w23, [x4, #12]              // load counter LSW
	ldr	q1, [x4]                    // load whole counter value
#ifdef __AARCH64EL__
	rev	w23, w23
#endif
	str	q1, [sp, #80]               // copy counter value

.Lctr_enc_short_loop:
	add	x0, sp, #80                 // input counter value
	add	x1, sp, #64                 // output on the stack
	mov	x2, x22                     // key

	bl	AES_encrypt

	ldr	q0, [x19], #16              // load input
	ldr	q1, [sp, #64]               // load encrypted counter
	add	x23, x23, #1
#ifdef __AARCH64EL__
	rev	w0, w23
	str	w0, [sp, #80+12]            // next counter value
#else
	str	w23, [sp, #80+12]           // next counter value
#endif
	eor	v0.16b, v0.16b, v1.16b
	str	q0, [x20], #16              // store output
	subs	x21, x21, #1
	bne	.Lctr_enc_short_loop

	movi	v0.16b, #0
	movi	v1.16b, #0
	stp	q0, q1, [sp, #64]

	ldr	x23, [sp, #48]
	ldp	x21, x22, [sp, #32]
	ldp	x19, x20, [sp, #16]
	ldp	x29, x30, [sp], #96
	ret
.size	ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks

.globl	ossl_bsaes_xts_encrypt
.type	ossl_bsaes_xts_encrypt,%function
.align	4
// On entry:
//   x0 -> input plaintext
//   x1 -> output ciphertext
//   x2 = length of text in bytes (must be at least 16)
//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output ciphertext filled in
//   No output registers, usual AAPCS64 register preservation
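//
// Consecutive tweaks are produced by doubling in GF(2^128) using the
// standard XTS polynomial (the 0x87 reduction constant lives in the
// .Lxts_magic literal loaded below). A byte-wise C model of one
// doubling, little-endian as in XTS (illustrative sketch only):
//
//   // tweak' = tweak * x in GF(2^128)
//   static void xts_double(uint8_t t[16])
//   {
//       unsigned carry = t[15] >> 7;           // top bit that falls off
//       for (int i = 15; i > 0; i--)
//           t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
//       t[0] = (uint8_t)((t[0] << 1) ^ (carry ? 0x87 : 0x00));
//   }
//
// The sshr/cmtst/ext sequences below compute the same thing on two
// 64-bit lanes at once.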
ossl_bsaes_xts_encrypt:
	AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //        nrounds*128-96 bytes: key schedule
        // x19 ->
        //        16 bytes: frame record
        //        4*16 bytes: tweak storage across _bsaes_encrypt8
        //        6*8 bytes: storage for 5 callee-saved general-purpose
        //                   registers (plus 8 bytes of alignment padding)
        //        8*8 bytes: storage for 8 callee-saved SIMD registers
	stp	x29, x30, [sp, #-192]!
	stp	x19, x20, [sp, #80]
	stp	x21, x22, [sp, #96]
	str	x23, [sp, #112]
	stp	d8, d9, [sp, #128]
	stp	d10, d11, [sp, #144]
	stp	d12, d13, [sp, #160]
	stp	d14, d15, [sp, #176]

	mov	x19, sp
	mov	x20, x0
	mov	x21, x1
	mov	x22, x2
	mov	x23, x3

        // generate initial tweak
	sub	sp, sp, #16
	mov	x0, x5                      // iv[]
	mov	x1, sp
	mov	x2, x4                      // key2
	bl	AES_encrypt
	ldr	q11, [sp], #16

	ldr	w1, [x23, #240]             // get # of rounds
        // allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
	mov	x9, x23                     // pass key
	mov	x10, x1                     // pass # of rounds
	mov	sp, x17
	bl	_bsaes_key_convert
	eor	v15.16b, v15.16b, v7.16b    // fix up last round key
	str	q15, [x17]                  // save last round key

	subs	x22, x22, #0x80
	blo	.Lxts_enc_short
	b	.Lxts_enc_loop

.align	4
.Lxts_enc_loop:
	ldr	q8, .Lxts_magic
	mov	x10, x1                     // pass rounds
	add	x2, x19, #16
	ldr	q0, [x20], #16
	sshr	v1.2d, v11.2d, #63
	mov	x9, sp                      // pass key schedule
	ldr	q6, .Lxts_magic+16
	add	v2.2d, v11.2d, v11.2d
	cmtst	v3.2d, v11.2d, v6.2d
	and	v1.16b, v1.16b, v8.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	ldr	q4, [x20], #16
	eor	v12.16b, v2.16b, v1.16b
	eor	v1.16b, v4.16b, v12.16b
	eor	v0.16b, v0.16b, v11.16b
	cmtst	v2.2d, v12.2d, v6.2d
	add	v4.2d, v12.2d, v12.2d
	add	x0, x19, #16
	ext	v3.16b, v3.16b, v3.16b, #8
	and	v2.16b, v2.16b, v8.16b
	eor	v13.16b, v4.16b, v3.16b
	ldr	q3, [x20], #16
	ext	v4.16b, v2.16b, v2.16b, #8
	eor	v2.16b, v3.16b, v13.16b
	ldr	q3, [x20], #16
	add	v5.2d, v13.2d, v13.2d
	cmtst	v7.2d, v13.2d, v6.2d
	and	v7.16b, v7.16b, v8.16b
	ldr	q9, [x20], #16
	ext	v7.16b, v7.16b, v7.16b, #8
	ldr	q10, [x20], #16
	eor	v14.16b, v5.16b, v4.16b
	ldr	q16, [x20], #16
	add	v4.2d, v14.2d, v14.2d
	eor	v3.16b, v3.16b, v14.16b
	eor	v15.16b, v4.16b, v7.16b
	add	v5.2d, v15.2d, v15.2d
	ldr	q7, [x20], #16
	cmtst	v4.2d, v14.2d, v6.2d
	and	v17.16b, v4.16b, v8.16b
	cmtst	v18.2d, v15.2d, v6.2d
	eor	v4.16b, v9.16b, v15.16b
	ext	v9.16b, v17.16b, v17.16b, #8
	eor	v9.16b, v5.16b, v9.16b
	add	v17.2d, v9.2d, v9.2d
	and	v18.16b, v18.16b, v8.16b
	eor	v5.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	ext	v10.16b, v18.16b, v18.16b, #8
	cmtst	v9.2d, v9.2d, v6.2d
	and	v9.16b, v9.16b, v8.16b
	eor	v10.16b, v17.16b, v10.16b
	cmtst	v17.2d, v10.2d, v6.2d
	eor	v6.16b, v16.16b, v10.16b
	str	q10, [x2], #16
	ext	v9.16b, v9.16b, v9.16b, #8
	add	v10.2d, v10.2d, v10.2d
	eor	v9.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	eor	v7.16b, v7.16b, v9.16b
	add	v9.2d, v9.2d, v9.2d
	and	v8.16b, v17.16b, v8.16b
	ext	v8.16b, v8.16b, v8.16b, #8
	eor	v8.16b, v9.16b, v8.16b
	str	q8, [x2]                    // next round tweak

	bl	_bsaes_encrypt8

	ldr	q8, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q9, [x0], #16
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	ldr	q10, [x0], #16
	eor	v3.16b, v3.16b, v15.16b
	subs	x22, x22, #0x80
	str	q0, [x21], #16
	ldr	q11, [x0]                   // next round tweak
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v8.16b
	eor	v1.16b, v2.16b, v9.16b
	str	q4, [x21], #16
	eor	v2.16b, v5.16b, v10.16b
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q2, [x21], #16
	bpl	.Lxts_enc_loop

.Lxts_enc_short:
	adds	x22, x22, #0x70
	bmi	.Lxts_enc_done

	ldr	q8, .Lxts_magic
	sshr	v1.2d, v11.2d, #63
	add	v2.2d, v11.2d, v11.2d
	ldr	q9, .Lxts_magic+16
	subs	x22, x22, #0x10
	ldr	q0, [x20], #16
	and	v1.16b, v1.16b, v8.16b
	cmtst	v3.2d, v11.2d, v9.2d
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	eor	v12.16b, v2.16b, v1.16b
	ext	v1.16b, v3.16b, v3.16b, #8
	add	v2.2d, v12.2d, v12.2d
	cmtst	v3.2d, v12.2d, v9.2d
	eor	v13.16b, v2.16b, v1.16b
	and	v22.16b, v3.16b, v8.16b
	bmi	.Lxts_enc_1

	ext	v2.16b, v22.16b, v22.16b, #8
	add	v3.2d, v13.2d, v13.2d
	ldr	q1, [x20], #16
	cmtst	v4.2d, v13.2d, v9.2d
	subs	x22, x22, #0x10
	eor	v14.16b, v3.16b, v2.16b
	and	v23.16b, v4.16b, v8.16b
	bmi	.Lxts_enc_2

	ext	v3.16b, v23.16b, v23.16b, #8
	add	v4.2d, v14.2d, v14.2d
	ldr	q2, [x20], #16
	cmtst	v5.2d, v14.2d, v9.2d
	eor	v0.16b, v0.16b, v11.16b
	subs	x22, x22, #0x10
	eor	v15.16b, v4.16b, v3.16b
	and	v24.16b, v5.16b, v8.16b
	bmi	.Lxts_enc_3

	ext	v4.16b, v24.16b, v24.16b, #8
	add	v5.2d, v15.2d, v15.2d
	ldr	q3, [x20], #16
	cmtst	v6.2d, v15.2d, v9.2d
	eor	v1.16b, v1.16b, v12.16b
	subs	x22, x22, #0x10
	eor	v16.16b, v5.16b, v4.16b
	and	v25.16b, v6.16b, v8.16b
	bmi	.Lxts_enc_4

	ext	v5.16b, v25.16b, v25.16b, #8
	add	v6.2d, v16.2d, v16.2d
	add	x0, x19, #16
	cmtst	v7.2d, v16.2d, v9.2d
	ldr	q4, [x20], #16
	eor	v2.16b, v2.16b, v13.16b
	str	q16, [x0], #16
	subs	x22, x22, #0x10
	eor	v17.16b, v6.16b, v5.16b
	and	v26.16b, v7.16b, v8.16b
	bmi	.Lxts_enc_5

	ext	v7.16b, v26.16b, v26.16b, #8
	add	v18.2d, v17.2d, v17.2d
	ldr	q5, [x20], #16
	eor	v3.16b, v3.16b, v14.16b
	str	q17, [x0], #16
	subs	x22, x22, #0x10
	eor	v18.16b, v18.16b, v7.16b
	bmi	.Lxts_enc_6

	ldr	q6, [x20], #16
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	str	q18, [x0]                   // next round tweak
	mov	x9, sp                      // pass key schedule
	mov	x10, x1
	add	x0, x19, #16
	sub	x22, x22, #0x10
	eor	v6.16b, v6.16b, v17.16b

	bl	_bsaes_encrypt8

	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q17, [x0], #16
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v3.16b, v3.16b, v15.16b
	ldr	q11, [x0]                   // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	eor	v1.16b, v2.16b, v17.16b
	str	q4, [x21], #16
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_enc_done

.align	4
.Lxts_enc_6:
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_encrypt8

	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	ldr	q11, [x0]                   // next round tweak
	eor	v3.16b, v3.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
   1721 	str	q4, [x21], #16
   1722 	str	q6, [x21], #16
   1723 	str	q3, [x21], #16
   1724 	str	q0, [x21], #16
   1725 	b	.Lxts_enc_done
   1726 
   1727 .align	4
   1728 .Lxts_enc_5:
   1729 	eor	v3.16b, v3.16b, v14.16b
   1730 	eor	v4.16b, v4.16b, v15.16b
   1731 	mov	x9, sp                      // pass key schedule
   1732 	mov	x10, x1                     // pass rounds
   1733 	add	x0, x19, #16
   1734 
   1735 	bl	_bsaes_encrypt8
   1736 
   1737 	eor	v0.16b, v0.16b, v11.16b
   1738 	eor	v1.16b, v1.16b, v12.16b
   1739 	ldr	q11, [x0]                   // next round tweak
   1740 	eor	v4.16b, v4.16b, v13.16b
   1741 	eor	v6.16b, v6.16b, v14.16b
   1742 	eor	v3.16b, v3.16b, v15.16b
   1743 	str	q0, [x21], #16
   1744 	str	q1, [x21], #16
   1745 	str	q4, [x21], #16
   1746 	str	q6, [x21], #16
   1747 	str	q3, [x21], #16
   1748 	b	.Lxts_enc_done
   1749 
   1750 .align	4
   1751 .Lxts_enc_4:
   1752 	eor	v2.16b, v2.16b, v13.16b
   1753 	eor	v3.16b, v3.16b, v14.16b
   1754 	mov	x9, sp                      // pass key schedule
   1755 	mov	x10, x1                     // pass rounds
   1756 	add	x0, x19, #16
   1757 
   1758 	bl	_bsaes_encrypt8
   1759 
   1760 	eor	v0.16b, v0.16b, v11.16b
   1761 	eor	v1.16b, v1.16b, v12.16b
   1762 	eor	v4.16b, v4.16b, v13.16b
   1763 	eor	v6.16b, v6.16b, v14.16b
   1764 	mov	v11.16b, v15.16b            // next round tweak
   1765 	str	q0, [x21], #16
   1766 	str	q1, [x21], #16
   1767 	str	q4, [x21], #16
   1768 	str	q6, [x21], #16
   1769 	b	.Lxts_enc_done
   1770 
   1771 .align	4
   1772 .Lxts_enc_3:
   1773 	eor	v1.16b, v1.16b, v12.16b
   1774 	eor	v2.16b, v2.16b, v13.16b
   1775 	mov	x9, sp                      // pass key schedule
   1776 	mov	x10, x1                     // pass rounds
   1777 	add	x0, x19, #16
   1778 
   1779 	bl	_bsaes_encrypt8
   1780 
   1781 	eor	v0.16b, v0.16b, v11.16b
   1782 	eor	v1.16b, v1.16b, v12.16b
   1783 	eor	v4.16b, v4.16b, v13.16b
   1784 	mov	v11.16b, v14.16b            // next round tweak
   1785 	str	q0, [x21], #16
   1786 	str	q1, [x21], #16
   1787 	str	q4, [x21], #16
   1788 	b	.Lxts_enc_done
   1789 
   1790 .align	4
   1791 .Lxts_enc_2:
   1792 	eor	v0.16b, v0.16b, v11.16b
   1793 	eor	v1.16b, v1.16b, v12.16b
   1794 	mov	x9, sp                      // pass key schedule
   1795 	mov	x10, x1                     // pass rounds
   1796 	add	x0, x19, #16
   1797 
   1798 	bl	_bsaes_encrypt8
   1799 
   1800 	eor	v0.16b, v0.16b, v11.16b
   1801 	eor	v1.16b, v1.16b, v12.16b
   1802 	mov	v11.16b, v13.16b            // next round tweak
   1803 	str	q0, [x21], #16
   1804 	str	q1, [x21], #16
   1805 	b	.Lxts_enc_done
   1806 
   1807 .align	4
   1808 .Lxts_enc_1:
   1809 	eor	v0.16b, v0.16b, v11.16b
   1810 	sub	x0, sp, #16
   1811 	sub	x1, sp, #16
   1812 	mov	x2, x23
   1813 	mov	v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
   1814 	mov	v14.d[0], v12.d[1]
   1815 	str	q0, [sp, #-16]!
   1816 
   1817 	bl	AES_encrypt
   1818 
   1819 	ldr	q0, [sp], #16
   1820 	trn1	v13.2d, v11.2d, v13.2d
   1821 	trn1	v11.2d, v12.2d, v14.2d      // next round tweak
   1822 	eor	v0.16b, v0.16b, v13.16b
   1823 	str	q0, [x21], #16
   1824 
   1825 .Lxts_enc_done:
   1826 	adds	x22, x22, #0x10
   1827 	beq	.Lxts_enc_ret
   1828 
   1829 	sub	x6, x21, #0x10
   1830         // Penultimate plaintext block produces final ciphertext part-block
   1831         // plus remaining part of final plaintext block. Move ciphertext part
   1832         // to final position and reuse penultimate ciphertext block buffer to
   1833         // construct final plaintext block
   1834 .Lxts_enc_steal:
   1835 	ldrb	w0, [x20], #1
   1836 	ldrb	w1, [x21, #-0x10]
   1837 	strb	w0, [x21, #-0x10]
   1838 	strb	w1, [x21], #1
   1839 
   1840 	subs	x22, x22, #1
   1841 	bhi	.Lxts_enc_steal
   1842 
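        // In C-style pseudocode the steal loop above is (illustrative
        // only; `out` points at the penultimate ciphertext block, `in`
        // at the plaintext tail and `tail` is the partial-block length):
        //
        //     for (i = 0; i < tail; i++) {
        //         c = out[i];          // ciphertext byte to steal
        //         out[i] = *in++;      // build final input block in place
        //         out[16 + i] = c;     // stolen byte to its final position
        //     }
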
        // Finally encrypt the penultimate ciphertext block using the
        // last tweak
	ldr	q0, [x6]
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	x21, x6
	mov	v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers

	bl	AES_encrypt

	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [x21]

.Lxts_enc_ret:

	movi	v0.16b, #0
	movi	v1.16b, #0
.Lxts_enc_bzero:	// wipe key schedule
	stp	q0, q1, [sp], #32
	cmp	sp, x19
	bne	.Lxts_enc_bzero
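        // (the key schedule occupies rounds*128 - 96 bytes, always a
        // multiple of 32, so the 32-byte wipe loop lands exactly on x19)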

	ldp	x19, x20, [sp, #80]
	ldp	x21, x22, [sp, #96]
	ldr	x23, [sp, #112]
	ldp	d8, d9, [sp, #128]
	ldp	d10, d11, [sp, #144]
	ldp	d12, d13, [sp, #160]
	ldp	d14, d15, [sp, #176]
	ldp	x29, x30, [sp], #192
	ret
.size	ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

// The assembler doesn't seem capable of de-duplicating these constants
// when they are expressed using `ldr qd, =` literal syntax, so assign
// them a symbolic address instead
.align	5
.Lxts_magic:
.quad	1, 0x87, 0x4000000000000000, 0x4000000000000000
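
// For reference, one GF(2^128) doubling of the tweak {t0, t1} (low and
// high 64-bit halves), as implemented by the sshr/and/ext/add/eor
// sequences above, is in C-style pseudocode (illustrative only):
//
//     new_t0 = (t0 << 1) ^ ((t1 >> 63) ? 0x87 : 0); // reduce mod x^128+x^7+x^2+x+1
//     new_t1 = (t1 << 1) ^ ((t0 >> 63) ? 1 : 0);    // carry into the high half
//
// The 0x4000000000000000 lanes let cmtst test bit 62 of the current
// tweak, i.e. the top bit of the doubled tweak, so the reduction mask
// for the following doubling can be computed an iteration early.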

.globl	ossl_bsaes_xts_decrypt
.type	ossl_bsaes_xts_decrypt,%function
.align	4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //        nrounds*128-96 bytes: key schedule
        // x19 ->
        //        16 bytes: frame record
        //        4*16 bytes: tweak storage across _bsaes_decrypt8
        //        6*8 bytes: storage for 5 callee-saved general-purpose
        //                   registers (the sixth slot is alignment padding)
        //        8*8 bytes: storage for 8 callee-saved SIMD registers
	stp	x29, x30, [sp, #-192]!
	stp	x19, x20, [sp, #80]
	stp	x21, x22, [sp, #96]
	str	x23, [sp, #112]
	stp	d8, d9, [sp, #128]
	stp	d10, d11, [sp, #144]
	stp	d12, d13, [sp, #160]
	stp	d14, d15, [sp, #176]

	mov	x19, sp
	mov	x20, x0
	mov	x21, x1
	mov	x22, x2
	mov	x23, x3

        // generate initial tweak
	sub	sp, sp, #16
	mov	x0, x5                      // iv[]
	mov	x1, sp
	mov	x2, x4                      // key2
	bl	AES_encrypt
	ldr	q11, [sp], #16

	ldr	w1, [x23, #240]             // get # of rounds
        // allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes
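        // (e.g. for AES-128, with 10 rounds, this reserves
        // 10*128 - 96 = 1184 bytes)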

        // populate the key schedule
	mov	x9, x23                     // pass key
	mov	x10, x1                     // pass # of rounds
	mov	sp, x17
	bl	_bsaes_key_convert
	ldr	q6, [sp]
	str	q15, [x17]                  // save last round key
	eor	v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
	str	q6, [sp]

	sub	x30, x22, #0x10
	tst	x22, #0xf                   // if not multiple of 16
	csel	x22, x30, x22, ne           // subtract another 16 bytes
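        // (that held-back full block is processed together with the
        // partial block by the ciphertext-stealing code after
        // .Lxts_dec_done)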
	subs	x22, x22, #0x80

	blo	.Lxts_dec_short
	b	.Lxts_dec_loop

.align	4
.Lxts_dec_loop:
	ldr	q8, .Lxts_magic
	mov	x10, x1                     // pass rounds
	add	x2, x19, #16
	ldr	q0, [x20], #16
	sshr	v1.2d, v11.2d, #63
	mov	x9, sp                      // pass key schedule
	ldr	q6, .Lxts_magic+16
	add	v2.2d, v11.2d, v11.2d
	cmtst	v3.2d, v11.2d, v6.2d
	and	v1.16b, v1.16b, v8.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	ldr	q4, [x20], #16
	eor	v12.16b, v2.16b, v1.16b
	eor	v1.16b, v4.16b, v12.16b
	eor	v0.16b, v0.16b, v11.16b
	cmtst	v2.2d, v12.2d, v6.2d
	add	v4.2d, v12.2d, v12.2d
	add	x0, x19, #16
	ext	v3.16b, v3.16b, v3.16b, #8
	and	v2.16b, v2.16b, v8.16b
	eor	v13.16b, v4.16b, v3.16b
	ldr	q3, [x20], #16
	ext	v4.16b, v2.16b, v2.16b, #8
	eor	v2.16b, v3.16b, v13.16b
	ldr	q3, [x20], #16
	add	v5.2d, v13.2d, v13.2d
	cmtst	v7.2d, v13.2d, v6.2d
	and	v7.16b, v7.16b, v8.16b
	ldr	q9, [x20], #16
	ext	v7.16b, v7.16b, v7.16b, #8
	ldr	q10, [x20], #16
	eor	v14.16b, v5.16b, v4.16b
	ldr	q16, [x20], #16
	add	v4.2d, v14.2d, v14.2d
	eor	v3.16b, v3.16b, v14.16b
	eor	v15.16b, v4.16b, v7.16b
	add	v5.2d, v15.2d, v15.2d
	ldr	q7, [x20], #16
	cmtst	v4.2d, v14.2d, v6.2d
	and	v17.16b, v4.16b, v8.16b
	cmtst	v18.2d, v15.2d, v6.2d
	eor	v4.16b, v9.16b, v15.16b
	ext	v9.16b, v17.16b, v17.16b, #8
	eor	v9.16b, v5.16b, v9.16b
	add	v17.2d, v9.2d, v9.2d
	and	v18.16b, v18.16b, v8.16b
	eor	v5.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	ext	v10.16b, v18.16b, v18.16b, #8
	cmtst	v9.2d, v9.2d, v6.2d
	and	v9.16b, v9.16b, v8.16b
	eor	v10.16b, v17.16b, v10.16b
	cmtst	v17.2d, v10.2d, v6.2d
	eor	v6.16b, v16.16b, v10.16b
	str	q10, [x2], #16
	ext	v9.16b, v9.16b, v9.16b, #8
	add	v10.2d, v10.2d, v10.2d
	eor	v9.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	eor	v7.16b, v7.16b, v9.16b
	add	v9.2d, v9.2d, v9.2d
	and	v8.16b, v17.16b, v8.16b
	ext	v8.16b, v8.16b, v8.16b, #8
	eor	v8.16b, v9.16b, v8.16b
	str	q8, [x2]                    // next round tweak

	bl	_bsaes_decrypt8

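        // _bsaes_decrypt8 returns the blocks permuted across registers:
        // outputs 0-7 come back in v0, v1, v6, v4, v2, v7, v3 and v5
        // respectively, so each one is XORed with its matching tweak below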
	eor	v6.16b, v6.16b, v13.16b
	eor	v0.16b, v0.16b, v11.16b
	ldr	q8, [x0], #16
	eor	v7.16b, v7.16b, v8.16b
	str	q0, [x21], #16
	eor	v0.16b, v1.16b, v12.16b
	ldr	q1, [x0], #16
	eor	v1.16b, v3.16b, v1.16b
	subs	x22, x22, #0x80
	eor	v2.16b, v2.16b, v15.16b
	eor	v3.16b, v4.16b, v14.16b
	ldr	q4, [x0], #16
	str	q0, [x21], #16
	ldr	q11, [x0]                   // next round tweak
	eor	v0.16b, v5.16b, v4.16b
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q2, [x21], #16
	str	q7, [x21], #16
	str	q1, [x21], #16
	str	q0, [x21], #16
	bpl	.Lxts_dec_loop

.Lxts_dec_short:
	adds	x22, x22, #0x70
	bmi	.Lxts_dec_done

	ldr	q8, .Lxts_magic
	sshr	v1.2d, v11.2d, #63
	add	v2.2d, v11.2d, v11.2d
	ldr	q9, .Lxts_magic+16
	subs	x22, x22, #0x10
	ldr	q0, [x20], #16
	and	v1.16b, v1.16b, v8.16b
	cmtst	v3.2d, v11.2d, v9.2d
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	eor	v12.16b, v2.16b, v1.16b
	ext	v1.16b, v3.16b, v3.16b, #8
	add	v2.2d, v12.2d, v12.2d
	cmtst	v3.2d, v12.2d, v9.2d
	eor	v13.16b, v2.16b, v1.16b
	and	v22.16b, v3.16b, v8.16b
	bmi	.Lxts_dec_1

	ext	v2.16b, v22.16b, v22.16b, #8
	add	v3.2d, v13.2d, v13.2d
	ldr	q1, [x20], #16
	cmtst	v4.2d, v13.2d, v9.2d
	subs	x22, x22, #0x10
	eor	v14.16b, v3.16b, v2.16b
	and	v23.16b, v4.16b, v8.16b
	bmi	.Lxts_dec_2

	ext	v3.16b, v23.16b, v23.16b, #8
	add	v4.2d, v14.2d, v14.2d
	ldr	q2, [x20], #16
	cmtst	v5.2d, v14.2d, v9.2d
	eor	v0.16b, v0.16b, v11.16b
	subs	x22, x22, #0x10
	eor	v15.16b, v4.16b, v3.16b
	and	v24.16b, v5.16b, v8.16b
	bmi	.Lxts_dec_3

	ext	v4.16b, v24.16b, v24.16b, #8
	add	v5.2d, v15.2d, v15.2d
	ldr	q3, [x20], #16
	cmtst	v6.2d, v15.2d, v9.2d
	eor	v1.16b, v1.16b, v12.16b
	subs	x22, x22, #0x10
	eor	v16.16b, v5.16b, v4.16b
	and	v25.16b, v6.16b, v8.16b
	bmi	.Lxts_dec_4

	ext	v5.16b, v25.16b, v25.16b, #8
	add	v6.2d, v16.2d, v16.2d
	add	x0, x19, #16
	cmtst	v7.2d, v16.2d, v9.2d
	ldr	q4, [x20], #16
	eor	v2.16b, v2.16b, v13.16b
	str	q16, [x0], #16
	subs	x22, x22, #0x10
	eor	v17.16b, v6.16b, v5.16b
	and	v26.16b, v7.16b, v8.16b
	bmi	.Lxts_dec_5

	ext	v7.16b, v26.16b, v26.16b, #8
	add	v18.2d, v17.2d, v17.2d
	ldr	q5, [x20], #16
	eor	v3.16b, v3.16b, v14.16b
	str	q17, [x0], #16
	subs	x22, x22, #0x10
	eor	v18.16b, v18.16b, v7.16b
	bmi	.Lxts_dec_6

	ldr	q6, [x20], #16
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	str	q18, [x0]                   // next round tweak
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16
	sub	x22, x22, #0x10
	eor	v6.16b, v6.16b, v17.16b

	bl	_bsaes_decrypt8

	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q17, [x0], #16
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	eor	v2.16b, v2.16b, v15.16b
	ldr	q11, [x0]                   // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	eor	v1.16b, v3.16b, v17.16b
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_6:
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	ldr	q11, [x0]                   // next round tweak
	eor	v2.16b, v2.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	str	q0, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_5:
	eor	v3.16b, v3.16b, v14.16b
	eor	v4.16b, v4.16b, v15.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q11, [x0]                   // next round tweak
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	eor	v2.16b, v2.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_4:
	eor	v2.16b, v2.16b, v13.16b
	eor	v3.16b, v3.16b, v14.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	mov	v11.16b, v15.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	str	q4, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_3:
	eor	v1.16b, v1.16b, v12.16b
	eor	v2.16b, v2.16b, v13.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	mov	v11.16b, v14.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_2:
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	v11.16b, v13.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_1:
	eor	v0.16b, v0.16b, v11.16b
	sub	x0, sp, #16
	sub	x1, sp, #16
	mov	x2, x23
	mov	v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov	v14.d[0], v12.d[1]
	str	q0, [sp, #-16]!

	bl	AES_decrypt

	ldr	q0, [sp], #16
	trn1	v13.2d, v11.2d, v13.2d
	trn1	v11.2d, v12.2d, v14.2d      // next round tweak
	eor	v0.16b, v0.16b, v13.16b
	str	q0, [x21], #16

.Lxts_dec_done:
	adds	x22, x22, #0x10
	beq	.Lxts_dec_ret

        // calculate one round of extra tweak for the stolen ciphertext
	ldr	q8, .Lxts_magic
	sshr	v6.2d, v11.2d, #63
	and	v6.16b, v6.16b, v8.16b
	add	v12.2d, v11.2d, v11.2d
	ext	v6.16b, v6.16b, v6.16b, #8
	eor	v12.16b, v12.16b, v6.16b
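        // (tweak ordering for stealing on decryption: the last full
        // ciphertext block is decrypted below with the extra tweak in
        // v12, while the stolen partial block is decrypted afterwards
        // with the previous tweak still held in v11)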

        // perform the final decryption with the last tweak value
	ldr	q0, [x20], #16
	eor	v0.16b, v0.16b, v12.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov	v14.d[0], v12.d[1]

	bl	AES_decrypt

	trn1	v12.2d, v12.2d, v14.2d
	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v12.16b
	str	q0, [x21]

	mov	x6, x21
        // Penultimate ciphertext block produces final plaintext part-block
        // plus remaining part of final ciphertext block. Move plaintext part
        // to final position and reuse penultimate plaintext block buffer to
        // construct final ciphertext block
.Lxts_dec_steal:
	ldrb	w1, [x21]
	ldrb	w0, [x20], #1
	strb	w1, [x21, #0x10]
	strb	w0, [x21], #1

	subs	x22, x22, #1
	bhi	.Lxts_dec_steal

        // Finally decrypt the penultimate plaintext block using the
        // penultimate tweak
	ldr	q0, [x6]
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	x21, x6

	bl	AES_decrypt

	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [x21]

.Lxts_dec_ret:

	movi	v0.16b, #0
	movi	v1.16b, #0
.Lxts_dec_bzero:	// wipe key schedule
	stp	q0, q1, [sp], #32
	cmp	sp, x19
	bne	.Lxts_dec_bzero

	ldp	x19, x20, [sp, #80]
	ldp	x21, x22, [sp, #96]
	ldr	x23, [sp, #112]
	ldp	d8, d9, [sp, #128]
	ldp	d10, d11, [sp, #144]
	ldp	d12, d13, [sp, #160]
	ldp	d14, d15, [sp, #176]
	ldp	x29, x30, [sp], #192
	ret
.size	ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt