#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# PPC74x0/G4e	35.5/52.1/(23.8)	11.9(*)/15.4
# PPC970/G5	37.9/55.0/(28.5)	22.2/28.5
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. This module doesn't make any assumption about key
#	schedule (or data, for that matter) alignment and handles it
#	in-line. Secondly, being transliterated from vpaes-x86_64.pl,
#	it relies on "nested inversion", which is better suited for
#	Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomic AltiVec
#	latency, 9 cycles per simple logical operation.

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
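# The assembly below is emitted through the ppc-xlate.pl wrapper located
# above, so the script is normally run with a "flavour" string and an
# output file name.  A typical invocation (flavour names are assumptions;
# the exact string comes from the Configure target) would be:
#
#	perl vpaes-ppc.pl linux64le vpaes-ppc.s
#
# 32-bit, 64-bit and little-endian flavours select the ABI constants above
# and the endian-specific fix-ups applied at the bottom of this file.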

$code.=<<___;
.machine	"any"

.text

.align	7		# totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# sr
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv

##
## "Hot" constants
##
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sbo:		# sbou, sbot
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
Lk_sb1:		# sb1u, sb1t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
Lk_sb2:		# sb2u, sb2t
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev

##
##  Decryption stuff
##
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev

##
##  Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev

Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis

Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
.align	5
Lconsts:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12	#vvvvv "distance between . and _vpaes_consts
	addi	r12,r12,-0x308
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
.align	6
___

my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
{
my ($inp,$out,$key) = map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));

$code.=<<___;
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.align	4
_vpaes_encrypt_preheat:
	mflr	r8
	bl	Lconsts
	mtlr	r8
	li	r11, 0xc0		# Lk_inv
	li	r10, 0xd0
	li	r9,  0xe0		# Lk_ipt
	li	r8,  0xf0
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	lvx	$invlo, r12, r11
	li	r11, 0x100
	lvx	$invhi, r12, r10
	li	r10, 0x110
	lvx	$iptlo, r12, r9
	li	r9,  0x120
	lvx	$ipthi, r12, r8
	li	r8,  0x130
	lvx	$sbou, r12, r11
	li	r11, 0x140
	lvx	$sbot, r12, r10
	li	r10, 0x150
	lvx	$sb1u, r12, r9
	lvx	$sb1t, r12, r8
	lvx	$sb2u, r12, r11
	lvx	$sb2t, r12, r10
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
##
##
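##  In this PowerPC port the roles above map as follows: the data block
##  lives in v0, the key schedule pointer arrives in r5, the constant
##  table is addressed through r12 (set up by Lconsts), the tables loaded
##  by _vpaes_encrypt_preheat sit in v10-v19, v8/v9 hold the 0x04/0x0f
##  splat constants and v7 is kept at zero.
##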
.align	5
_vpaes_encrypt_core:
	lwz	r8, 240($key)		# pull rounds
	li	r9, 16
	lvx	v5, 0, $key		# vmovdqu (%r9), %xmm5	# round0 key
	li	r11, 0x10
	lvx	v6, r9, $key
	addi	r9, r9, 16
	?vperm	v5, v5, v6, $keyperm	# align round key
	addi	r10, r11, 0x40
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm1
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb %xmm0, %xmm3, %xmm2
	vxor	v0, v0, v5		# vpxor %xmm5, %xmm1, %xmm0
	vxor	v0, v0, v1		# vpxor %xmm2, %xmm0, %xmm0
	mtctr	r8
	b	Lenc_entry

.align	4
Lenc_loop:
	# middle of middle round
	vperm	v4, $sb1t, v7, v2	# vpshufb %xmm2, %xmm13, %xmm4	# 4 = sb1u
	lvx	v1, r12, r11		# vmovdqa -0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	addi	r11, r11, 16
	vperm	v0, $sb1u, v7, v3	# vpshufb %xmm3, %xmm12, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	andi.	r11, r11, 0x30		# and \$0x30, %r11	# ... mod 4
	vperm	v5, $sb2t, v7, v2	# vpshufb %xmm2, %xmm15, %xmm5	# 4 = sb2u
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v2, $sb2u, v7, v3	# vpshufb %xmm3, %xmm14, %xmm2	# 2 = sb2t
	lvx	v4, r12, r10		# vmovdqa (%r11,%r10), %xmm4	# .Lk_mc_backward[]
	addi	r10, r11, 0x40
	vperm	v3, v0, v7, v1		# vpshufb %xmm1, %xmm0, %xmm3	# 0 = B
	vxor	v2, v2, v5		# vpxor %xmm5, %xmm2, %xmm2	# 2 = 2A
	vperm	v0, v0, v7, v4		# vpshufb %xmm4, %xmm0, %xmm0	# 3 = D
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 0 = 2A+B
	vperm	v4, v3, v7, v1		# vpshufb %xmm1, %xmm3, %xmm4	# 0 = 2B+C
	vxor	v0, v0, v3		# vpxor %xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D

Lenc_entry:
	# top of round
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vperm	v5, $invhi, $invhi, v0	# vpshufb %xmm1, %xmm11, %xmm5	# 2 = a/k
	vxor	v0, v0, v1		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vand	v0, v0, v9
	vxor	v3, v3, v5		# vpxor %xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	vmr	v5, v6
	lvx	v6, r9, $key		# vmovdqu (%r9), %xmm5
	vperm	v3, $invlo, v7, v4	# vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	addi	r9, r9, 16
	vxor	v2, v2, v0		# vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	bdnz	Lenc_loop

	# middle of last round
	addi	r10, r11, 0x80
					# vmovdqa -0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
					# vmovdqa -0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	vperm	v4, $sbou, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	lvx	v1, r12, r10		# vmovdqa 0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	vperm	v0, $sbot, v7, v3	# vpshufb %xmm3, %xmm0, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v0, v0, v7, v1		# vpshufb %xmm1, %xmm0, %xmm0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

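##
##  vpaes_encrypt entry point: per the register map in the Perl section
##  above, r3 = input block, r4 = output block, r5 = key schedule.
##  Unaligned input and output are handled in-line.
##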
.globl	.vpaes_encrypt
.align	5
.vpaes_encrypt:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r6
	mfspr	r7, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	li	r0, -1
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_encrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	andi.	r8, $out, 15
	li	r9, 16
	beq	Lenc_out_aligned

	vperm	v0, v0, v0, $outperm	# rotate right/left
	mtctr	r9
Lenc_out_unaligned:
	stvebx	v0, 0, $out
	addi	$out, $out, 1
	bdnz	Lenc_out_unaligned
	b	Lenc_done

.align	4
Lenc_out_aligned:
	stvx	v0, 0, $out
Lenc_done:

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r6
	mtspr	256, r7			# restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_encrypt,.-.vpaes_encrypt

.align	4
_vpaes_decrypt_preheat:
	mflr	r8
	bl	Lconsts
	mtlr	r8
	li	r11, 0xc0		# Lk_inv
	li	r10, 0xd0
	li	r9,  0x160		# Ldipt
	li	r8,  0x170
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	lvx	$invlo, r12, r11
	li	r11, 0x180
	lvx	$invhi, r12, r10
	li	r10, 0x190
	lvx	$iptlo, r12, r9
	li	r9,  0x1a0
	lvx	$ipthi, r12, r8
	li	r8,  0x1b0
	lvx	$sbou, r12, r11
	li	r11, 0x1c0
	lvx	$sbot, r12, r10
	li	r10, 0x1d0
	lvx	$sb9u, r12, r9
	li	r9,  0x1e0
	lvx	$sb9t, r12, r8
	li	r8,  0x1f0
	lvx	$sbdu, r12, r11
	li	r11, 0x200
	lvx	$sbdt, r12, r10
	li	r10, 0x210
	lvx	$sbbu, r12, r9
	lvx	$sbbt, r12, r8
	lvx	$sbeu, r12, r11
	lvx	$sbet, r12, r10
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  Decryption core
##
##  Same API as encryption core.
##
.align	4
_vpaes_decrypt_core:
	lwz	r8, 240($key)		# pull rounds
	li	r9, 16
	lvx	v5, 0, $key		# vmovdqu (%r9), %xmm4	# round0 key
	li	r11, 0x30
	lvx	v6, r9, $key
	addi	r9, r9, 16
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb %xmm0, %xmm1, %xmm0
	vxor	v0, v0, v5		# vpxor %xmm4, %xmm2, %xmm2
	vxor	v0, v0, v1		# vpxor %xmm2, %xmm0, %xmm0
	mtctr	r8
	b	Ldec_entry

.align	4
Ldec_loop:
#
#  Inverse mix columns
#
	lvx	v0, r12, r11		# v5 and v0 are flipped
					# vmovdqa -0x20(%r10),%xmm4	# 4 : sb9u
					# vmovdqa -0x10(%r10),%xmm1	# 0 : sb9t
	vperm	v4, $sb9u, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sb9u
	subi	r11, r11, 16
	vperm	v1, $sb9t, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb9t
	andi.	r11, r11, 0x30
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0
					# vmovdqa 0x00(%r10),%xmm4	# 4 : sbdu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa 0x10(%r10),%xmm1	# 0 : sbdt

	vperm	v4, $sbdu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbdu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbdt, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbdt
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
					# vmovdqa 0x20(%r10), %xmm4	# 4 : sbbu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa 0x30(%r10), %xmm1	# 0 : sbbt

	vperm	v4, $sbbu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbbu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbbt, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbbt
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
					# vmovdqa 0x40(%r10), %xmm4	# 4 : sbeu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa 0x50(%r10), %xmm1	# 0 : sbet

	vperm	v4, $sbeu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbeu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbet, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbet
	vxor	v0, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	vxor	v0, v0, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch

Ldec_entry:
	# top of round
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vperm	v2, $invhi, $invhi, v0	# vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v0, v0, v1		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vand	v0, v0, v9
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v2		# vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	vmr	v5, v6
	lvx	v6, r9, $key		# vmovdqu (%r9), %xmm0
	vperm	v3, $invlo, v7, v4	# vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	addi	r9, r9, 16
	vxor	v2, v2, v0		# vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	bdnz	Ldec_loop

	# middle of last round
	addi	r10, r11, 0x80
					# vmovdqa 0x60(%r10), %xmm4	# 3 : sbou
	vperm	v4, $sbou, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
					# vmovdqa 0x70(%r10), %xmm1	# 0 : sbot
	lvx	v2, r12, r10		# vmovdqa -0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	vperm	v1, $sbot, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v1, v4		# vpxor %xmm4, %xmm1, %xmm0	# 0 = A
	vperm	v0, v0, v7, v2		# vpshufb %xmm2, %xmm0, %xmm0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

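##
##  vpaes_decrypt entry point: same register usage as vpaes_encrypt
##  above, r3 = input block, r4 = output block, r5 = key schedule.
##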
.globl	.vpaes_decrypt
.align	5
.vpaes_decrypt:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r6
	mfspr	r7, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	li	r0, -1
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_decrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	andi.	r8, $out, 15
	li	r9, 16
	beq	Ldec_out_aligned

	vperm	v0, v0, v0, $outperm	# rotate right/left
	mtctr	r9
Ldec_out_unaligned:
	stvebx	v0, 0, $out
	addi	$out, $out, 1
	bdnz	Ldec_out_unaligned
	b	Ldec_done

.align	4
Ldec_out_aligned:
	stvx	v0, 0, $out
Ldec_done:

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r6
	mtspr	256, r7			# restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_decrypt,.-.vpaes_decrypt

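##
##  vpaes_cbc_encrypt entry point: r3 = in, r4 = out, r5 = length,
##  r6 = key, r7 = iv, r8 = encrypt/decrypt selector.  Lengths below 16
##  bytes return right away; otherwise the length is rounded down to a
##  multiple of 16.
##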
.globl	.vpaes_cbc_encrypt
.align	5
.vpaes_cbc_encrypt:
	${UCMP}i r5,16
	bltlr-

	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
	mflr	r0
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r12, 256
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r12,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
	li	r9, -16
	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)

	and	r30, r5, r9		# copy length&-16
	andi.	r9, $out, 15		# is $out aligned?
	mr	r5, r6			# copy pointer to key
	mr	r31, r7			# copy pointer to iv
	li	r6, -1
	mcrf	cr1, cr0		# put aside $out alignment flag
	mr	r7, r12			# copy vrsave
	mtspr	256, r6			# preserve all AltiVec registers

	lvx	v24, 0, r31		# load [potentially unaligned] iv
	li	r9, 15
	?lvsl	$inpperm, 0, r31
	lvx	v25, r9, r31
	?vperm	v24, v24, v25, $inpperm

	cmpwi	r8, 0			# test direction
	neg	r8, $inp		# prepare for unaligned access
	vxor	v7, v7, v7
	?lvsl	$keyperm, 0, $key
	?lvsr	$outperm, 0, $out
	?lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	?vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo

	beq	Lcbc_decrypt

	bl	_vpaes_encrypt_preheat
	li	r0, 16

	beq	cr1, Lcbc_enc_loop	# $out is aligned

	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	andi.	r8, $out, 15
	vmr	v24, v0			# put aside iv
	sub	r9, $out, r8
	vperm	$outhead, v0, v0, $outperm	# rotate right/left

Lcbc_enc_head:
	stvebx	$outhead, r8, r9
	cmpwi	r8, 15
	addi	r8, r8, 1
	bne	Lcbc_enc_head

	sub.	r30, r30, r0		# len -= 16
	addi	$out, $out, 16
	beq	Lcbc_unaligned_done

Lcbc_enc_loop:
	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
	addi	$out, $out, 16
	bne	Lcbc_enc_loop

	b	Lcbc_done

.align	5
Lcbc_decrypt:
	bl	_vpaes_decrypt_preheat
	li	r0, 16

	beq	cr1, Lcbc_dec_loop	# $out is aligned

	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	andi.	r8, $out, 15
	vxor	v0, v0, v24		# ^= iv
	vmr	v24, v25
	sub	r9, $out, r8
	vperm	$outhead, v0, v0, $outperm	# rotate right/left

Lcbc_dec_head:
	stvebx	$outhead, r8, r9
	cmpwi	r8, 15
	addi	r8, r8, 1
	bne	Lcbc_dec_head

	sub.	r30, r30, r0		# len -= 16
	addi	$out, $out, 16
	beq	Lcbc_unaligned_done

Lcbc_dec_loop:
	vmr	v0, $inptail
	lvx	$inptail, 0, $inp
	addi	$inp, $inp, 16
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	vmr	v24, v25
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	vmr	$outhead, v0
	stvx	v1, 0, $out
	addi	$out, $out, 16
	bne	Lcbc_dec_loop

Lcbc_done:
	beq	cr1, Lcbc_write_iv	# $out is aligned

Lcbc_unaligned_done:
	andi.	r8, $out, 15
	sub	$out, $out, r8
	li	r9, 0
Lcbc_tail:
	stvebx	$outhead, r9, $out
	addi	r9, r9, 1
	cmpw	r9, r8
	bne	Lcbc_tail

Lcbc_write_iv:
	neg	r8, r31			# write [potentially unaligned] iv
	li	r10, 4
	?lvsl	$outperm, 0, r8
	li	r11, 8
	li	r12, 12
	vperm	v24, v24, v24, $outperm	# rotate right/left
	stvewx	v24, 0, r31		# ivp is at least 32-bit aligned
	stvewx	v24, r10, r31
	stvewx	v24, r11, r31
	stvewx	v24, r12, r31

	mtspr	256, r7			# restore vrsave
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
Lcbc_abort:
	$POP	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	$POP	r30,`$FRAME+$SIZE_T*0`($sp)
	$POP	r31,`$FRAME+$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,`$FRAME+$SIZE_T*2`
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,2,6,0
	.long	0
.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
___
}

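# The routines emitted above, together with the key-schedule entry points
# generated below, follow the calling conventions the OpenSSL C glue
# expects from a vpaes backend.  For reference only, the assumed C-side
# prototypes are (see the C glue for the authoritative declarations):
#
#	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_decrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                       size_t length, const AES_KEY *key,
#	                       unsigned char *ivec, int enc);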
{
my ($inp,$bits,$out)=map("r$_",(3..5));
my $dir="cr1";
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));

$code.=<<___;
########################################################
##                                                    ##
##                 AES key schedule                   ##
##                                                    ##
########################################################
.align	4
_vpaes_key_preheat:
	mflr	r8
	bl	Lconsts
	mtlr	r8
	li	r11, 0xc0		# Lk_inv
	li	r10, 0xd0
	li	r9,  0xe0		# L_ipt
	li	r8,  0xf0

	vspltisb	v8,4		# 0x04..04
	vxor	v9,v9,v9		# 0x00..00
	lvx	$invlo, r12, r11	# Lk_inv
	li	r11, 0x120
	lvx	$invhi, r12, r10
	li	r10, 0x130
	lvx	$iptlo, r12, r9		# Lk_ipt
	li	r9, 0x220
	lvx	$ipthi, r12, r8
	li	r8, 0x230

	lvx	v14, r12, r11		# Lk_sb1
	li	r11, 0x240
	lvx	v15, r12, r10
	li	r10, 0x250

	lvx	v16, r12, r9		# Lk_dksd
	li	r9, 0x260
	lvx	v17, r12, r8
	li	r8, 0x270
	lvx	v18, r12, r11		# Lk_dksb
	li	r11, 0x280
	lvx	v19, r12, r10
	li	r10, 0x290
	lvx	v20, r12, r9		# Lk_dkse
	li	r9, 0x2a0
	lvx	v21, r12, r8
	li	r8, 0x2b0
	lvx	v22, r12, r11		# Lk_dks9
	lvx	v23, r12, r10

	lvx	v24, r12, r9		# Lk_rcon
	lvx	v25, 0, r12		# Lk_mc_forward[0]
	lvx	v26, r12, r8		# Lks63
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

.align	4
_vpaes_schedule_core:
	mflr	r7

	bl	_vpaes_key_preheat	# load the tables

	#lvx	v0, 0, $inp		# vmovdqu (%rdi), %xmm0	# load key (unaligned)
	neg	r8, $inp		# prepare for unaligned access
	lvx	v0, 0, $inp
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$inpperm, 0, r8		# -$inp
	lvx	v6, 0, $inp		# v6 serves as inptail
	addi	$inp, $inp, 8
	?vperm	v0, v0, v6, $inpperm

	# input transform
	vmr	v3, v0			# vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	vmr	v7, v0			# vmovdqa %xmm0, %xmm7

	bne	$dir, Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	li	r8, 0x30		# mov \$0x30,%r8d
	li	r9, 4
	li	r10, 8
	li	r11, 12

	?lvsr	$outperm, 0, $out	# prepare for unaligned access
	vnor	$outmask, v9, v9	# 0xff..ff
	?vperm	$outmask, v9, $outmask, $outperm

	#stvx	v0, 0, $out		# vmovdqu %xmm0, (%rdx)
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea .Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
	b	Lschedule_go

Lschedule_am_decrypting:
	srwi	r8, $bits, 1		# shr \$1,%r8d
	andi.	r8, r8, 32		# and \$32,%r8d
	xori	r8, r8, 32		# xor \$32,%r8d	# nbits==192?0:32
	addi	r10, r12, 0x80		# lea .Lk_sr(%rip),%r10
	# decrypting, output zeroth round key after shiftrows
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	li	r9, 4
	li	r10, 8
	li	r11, 12
	vperm	v4, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3

	neg	r0, $out		# prepare for unaligned access
	?lvsl	$outperm, 0, r0
	vnor	$outmask, v9, v9	# 0xff..ff
	?vperm	$outmask, $outmask, v9, $outperm

	#stvx	v4, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	$outhead, v4, v4, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea .Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
	addi	$out, $out, 15		# 15 is not a typo
	xori	r8, r8, 0x30		# xor \$0x30, %r8

Lschedule_go:
	cmplwi	$bits, 192		# cmp \$192, %esi
	bgt	Lschedule_256
	beq	Lschedule_192
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
Lschedule_128:
	li	r0, 10			# mov \$10, %esi
	mtctr	r0

Loop_schedule_128:
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle	# write output
	b	Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
.align	4
Lschedule_192:
	li	r0, 4			# mov \$4, %esi
	lvx	v0, 0, $inp
	?vperm	v0, v6, v0, $inpperm
	?vsldoi	v0, v3, v0, 8		# vmovdqu 8(%rdi),%xmm0	# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	# input transform
	?vsldoi	v6, v0, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
	mtctr	r0

Loop_schedule_192:
	bl	_vpaes_schedule_round
	?vsldoi	v0, v6, v0, 8		# vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	# save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	# save key n+1
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle	# save key n+2
	bl	_vpaes_schedule_192_smear
	b	Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
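##  For reference, the loop counts above and below are sized for the
##  usual round-key totals: the 128-bit path runs 10 rounds for 11 round
##  keys, the 192-bit path ends up with 13, and the 256-bit path's 7
##  high/low passes plus the final mangle yield 15 round keys.
##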
.align	4
Lschedule_256:
	li	r0, 7			# mov \$7, %esi
	addi	$inp, $inp, 8
	lvx	v0, 0, $inp		# vmovdqu 16(%rdi),%xmm0	# load key part 2 (unaligned)
	?vperm	v0, v6, v0, $inpperm
	bl	_vpaes_schedule_transform	# input transform
	mtctr	r0

Loop_schedule_256:
	bl	_vpaes_schedule_mangle	# output low result
	vmr	v6, v0			# vmovdqa %xmm0, %xmm6	# save cur_lo in xmm6

	# high round
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	?vspltw	v0, v0, 3		# vpshufd \$0xFF, %xmm0, %xmm0
	vmr	v5, v7			# vmovdqa %xmm7, %xmm5
	vmr	v7, v6			# vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	vmr	v7, v5			# vmovdqa %xmm5, %xmm7

	b	Loop_schedule_256
##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align	4
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li	r11, 0x2e0		# lea .Lk_deskew(%rip),%r11
	li	r9,  0x2f0
	bne	$dir, Lschedule_mangle_last_dec

	# encrypting
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10),%xmm1
	li	r11, 0x2c0		# lea .Lk_opt(%rip), %r11	# prepare to output transform
	li	r9,  0x2d0		# prepare to output transform
	vperm	v0, v0, v0, v1		# vpshufb %xmm1, %xmm0, %xmm0	# output permute

	lvx	$iptlo, r11, r12	# reload $ipt
	lvx	$ipthi, r9, r12
	addi	$out, $out, 16		# add \$16, %rdx
	vxor	v0, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu %xmm0, (%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate right/left
	li	r10, 4
	vsel	v2, $outhead, v0, $outmask
	li	r11, 8
	stvx	v2, 0, $out
	li	r12, 12
	stvewx	v0, 0, $out		# some (or all) are redundant
	stvewx	v0, r10, $out
	stvewx	v0, r11, $out
	stvewx	v0, r12, $out
	b	Lschedule_mangle_done

.align	4
Lschedule_mangle_last_dec:
	lvx	$iptlo, r11, r12	# reload $ipt
	lvx	$ipthi, r9,  r12
	addi	$out, $out, -16		# add \$-16, %rdx
	vxor	v0, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu %xmm0, (%rdx)	# save last key
	addi	r9, $out, -15		# -15 is not a typo
	vperm	v0, v0, v0, $outperm	# rotate right/left
	li	r10, 4
	vsel	v2, $outhead, v0, $outmask
	li	r11, 8
	stvx	v2, 0, $out
	li	r12, 12
	stvewx	v0, 0, r9		# some (or all) are redundant
	stvewx	v0, r10, r9
	stvewx	v0, r11, r9
	stvewx	v0, r12, r9


Lschedule_mangle_done:
	mtlr	r7
	# cleanup
	vxor	v0, v0, v0		# vpxor %xmm0, %xmm0, %xmm0
	vxor	v1, v1, v1		# vpxor %xmm1, %xmm1, %xmm1
	vxor	v2, v2, v2		# vpxor %xmm2, %xmm2, %xmm2
	vxor	v3, v3, v3		# vpxor %xmm3, %xmm3, %xmm3
	vxor	v4, v4, v4		# vpxor %xmm4, %xmm4, %xmm4
	vxor	v5, v5, v5		# vpxor %xmm5, %xmm5, %xmm5
	vxor	v6, v6, v6		# vpxor %xmm6, %xmm6, %xmm6
	vxor	v7, v7, v7		# vpxor %xmm7, %xmm7, %xmm7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6: low side, d  c  0  0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
##
.align	4
_vpaes_schedule_192_smear:
	?vspltw	v0, v7, 3
	?vsldoi	v1, v9, v6, 12		# vpshufd \$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	?vsldoi	v0, v7, v0, 8		# vpshufd \$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	vxor	v6, v6, v1		# vpxor %xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	vxor	v6, v6, v0		# vpxor %xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
	vmr	v0, v6
	?vsldoi	v6, v6, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
.align	4
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor	v4, v4, v4		# vpxor %xmm4, %xmm4, %xmm4
	?vsldoi	v1, $rcon, v9, 15	# vpalignr \$15, %xmm8, %xmm4, %xmm1
	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr \$15, %xmm8, %xmm8, %xmm8
	vxor	v7, v7, v1		# vpxor %xmm1, %xmm7, %xmm7

	# rotate
	?vspltw	v0, v0, 3		# vpshufd \$0xFF, %xmm0, %xmm0
	?vsldoi	v0, v0, v0, 1		# vpalignr \$1, %xmm0, %xmm0, %xmm0

	# fall through...

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	# smear xmm7
	?vsldoi	v1, v9, v7, 12		# vpslldq \$4, %xmm7, %xmm1
	vxor	v7, v7, v1		# vpxor %xmm1, %xmm7, %xmm7
	vspltisb	v1, 0x0f	# 0x0f..0f
	?vsldoi	v4, v9, v7, 8		# vpslldq \$8, %xmm7, %xmm4

	# subbytes
	vand	v1, v1, v0		# vpand %xmm9, %xmm0, %xmm1	# 0 = k
	vsrb	v0, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vxor	v7, v7, v4		# vpxor %xmm4, %xmm7, %xmm7
	vperm	v2, $invhi, v9, v1	# vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v1, v1, v0		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, v9, v0	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vperm	v4, $invlo, v9, v1	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v7, v7, v26		# vpxor .Lk_s63(%rip), %xmm7, %xmm7
	vperm	v3, $invlo, v9, v3	# vpshufb %xmm3, %xmm10, %xmm3	# 2 = 1/iak
	vxor	v4, v4, v2		# vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v9, v4	# vpshufb %xmm4, %xmm10, %xmm2	# 3 = 1/jak
	vxor	v3, v3, v1		# vpxor %xmm1, %xmm3, %xmm3	# 2 = io
	vxor	v2, v2, v0		# vpxor %xmm0, %xmm2, %xmm2	# 3 = jo
	vperm	v4, v15, v9, v3		# vpshufb %xmm3, %xmm13, %xmm4	# 4 = sbou
	vperm	v1, v14, v9, v2		# vpshufb %xmm2, %xmm12, %xmm1	# 0 = sb1t
	vxor	v1, v1, v4		# vpxor %xmm4, %xmm1, %xmm1	# 0 = sbox output

	# add in smeared stuff
	vxor	v0, v1, v7		# vpxor %xmm7, %xmm1, %xmm0
	vxor	v7, v1, v7		# vmovdqa %xmm0, %xmm7
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm2
##
.align	4
_vpaes_schedule_transform:
	#vand	v1, v0, v9		# vpand %xmm9, %xmm0, %xmm1
	vsrb	v2, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
					# vmovdqa (%r11), %xmm2	# lo
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm2
					# vmovdqa 16(%r11), %xmm1	# hi
	vperm	v2, $ipthi, $ipthi, v2	# vpshufb %xmm0, %xmm1, %xmm0
	vxor	v0, v0, v2		# vpxor %xmm2, %xmm0, %xmm0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
.align	4
_vpaes_schedule_mangle:
	#vmr	v4, v0			# vmovdqa %xmm0, %xmm4	# save xmm0 for later
					# vmovdqa .Lk_mc_forward(%rip),%xmm5
	bne	$dir, Lschedule_mangle_dec

	# encrypting
	vxor	v4, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm4
	addi	$out, $out, 16		# add \$16, %rdx
	vperm	v4, v4, v4, v25		# vpshufb %xmm5, %xmm4, %xmm4
	vperm	v1, v4, v4, v25		# vpshufb %xmm5, %xmm4, %xmm1
	vperm	v3, v1, v1, v25		# vpshufb %xmm5, %xmm1, %xmm3
	vxor	v4, v4, v1		# vpxor %xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vxor	v3, v3, v4		# vpxor %xmm4, %xmm3, %xmm3

	vperm	v3, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add \$-16, %r8
	andi.	r8, r8, 0x30		# and \$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	vmr	$outhead, v1
	stvx	v2, 0, $out
	blr

.align	4
Lschedule_mangle_dec:
	# inverse mix columns
					# lea .Lk_dksd(%rip),%r11
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm4, %xmm1	# 1 = hi
	#and	v4, v0, v9		# vpand %xmm9, %xmm4, %xmm4	# 4 = lo

					# vmovdqa 0x00(%r11), %xmm2
	vperm	v2, v16, v16, v0	# vpshufb %xmm4, %xmm2, %xmm2
					# vmovdqa 0x10(%r11), %xmm3
	vperm	v3, v17, v17, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3

					# vmovdqa 0x20(%r11), %xmm2
	vperm	v2, v18, v18, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
					# vmovdqa 0x30(%r11), %xmm3
	vperm	v3, v19, v19, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3

					# vmovdqa 0x40(%r11), %xmm2
	vperm	v2, v20, v20, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
					# vmovdqa 0x50(%r11), %xmm3
	vperm	v3, v21, v21, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3

					# vmovdqa 0x60(%r11), %xmm2
	vperm	v2, v22, v22, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3
					# vmovdqa 0x70(%r11), %xmm4
	vperm	v4, v23, v23, v1	# vpshufb %xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
	vxor	v3, v4, v2		# vpxor %xmm2, %xmm4, %xmm3

	addi	$out, $out, -16		# add \$-16, %rdx

	vperm	v3, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add \$-16, %r8
	andi.	r8, r8, 0x30		# and \$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	vmr	$outhead, v1
	stvx	v2, 0, $out
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0

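##
##  vpaes_set_encrypt_key entry point: r3 = user-supplied key, r4 = key
##  size in bits, r5 = AES_KEY structure to fill; returns 0 in r3.
##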
.globl	.vpaes_set_encrypt_key
.align	5
.vpaes_set_encrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r0
	mfspr	r6, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	li	r7, -1
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr \$5,%eax
	addi	r9, r9, 6		# add \$5,%eax
	stw	r9, 240($out)		# mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	cmplw	$dir, $bits, $bits	# set encrypt direction
	li	r8, 0x30		# mov \$0x30,%r8d
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	mtlr	r0
	xor	r3, r3, r3
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key

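##
##  vpaes_set_decrypt_key entry point: same arguments as above; the
##  decryption schedule is laid out from the end of the key structure
##  backwards, hence r5 is advanced by 16*rounds before the schedule
##  core runs.
##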
.globl	.vpaes_set_decrypt_key
.align	4
.vpaes_set_decrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r0
	mfspr	r6, 256			# save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	li	r7, -1
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr \$5,%eax
	addi	r9, r9, 6		# add \$5,%eax
	stw	r9, 240($out)		# mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	slwi	r9, r9, 4		# shl \$4,%eax
	add	$out, $out, r9		# lea (%rdx,%rax),%rdx

	cmplwi	$dir, $bits, 0		# set decrypt direction
	srwi	r8, $bits, 1		# shr \$1,%r8d
	andi.	r8, r8, 32		# and \$32,%r8d
	xori	r8, r8, 32		# xor \$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	mtlr	r0
	xor	r3, r3, r3
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,3,0
	.long	0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
___
}

my $consts=1;
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
	    my $conv=$2;
	    my @bytes=();

	    # convert to endian-agnostic format
	    foreach (split(/,\s+/,$1)) {
		my $l = /^0/?oct:int;
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
	    }

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv)  {
		    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do   { @bytes=reverse(@bytes); last; };
		}
	    }

	    #emit
	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	    next;
	}
	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
	if ($flavour =~ /le$/o) {	# little-endian
	    s/\?lvsr/lvsl/o or
	    s/\?lvsl/lvsr/o or
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian
	    s/\?([a-z]+)/$1/o;
	}

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";