#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
# - RISC-V Vector AES Block Cipher extension ('Zvkned')
# - RISC-V Zicclsm (main memory supports misaligned loads/stores)

# Reference: https://github.com/riscv/riscv-crypto/issues/192#issuecomment-1270447575
#
# Assume we have 12 GCM blocks and want to compute the GCM tag 4 blocks at a time.
# Tag = M0*H^12 + M1*H^11 + M2*H^10 + M3*H^9 +
#       M4*H^8  + M5*H^7  + M6*H^6  + M7*H^5 +
#       M8*H^4  + M9*H^3  + M10*H^2 + M11*H^1
# We can rewrite the formula into:
# T0 = 0
# T1 = (T0+M0)*H^4   T2 = (T0+M1)*H^4    T3 = (T0+M2)*H^4    T4 = (T0+M3)*H^4
# T5 = (T1+M4)*H^4   T6 = (T2+M5)*H^4    T7 = (T3+M6)*H^4    T8 = (T4+M7)*H^4
# T9 = (T5+M8)*H^4  T10 = (T6+M9)*H^3   T11 = (T7+M10)*H^2  T12 = (T8+M11)*H^1
# Tag = T9 + T10 + T11 + T12
#
# We multiply with [H^4, H^4, H^4, H^4] in every step except the last iteration.
# The last iteration multiplies with [H^4, H^3, H^2, H^1].
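#
# As a quick consistency check, expand one of the chains above (0-based
# message indices):
#   T12 = (T8+M11)*H^1
#       = ((T4+M7)*H^4 + M11)*H^1
#       = (((T0+M3)*H^4 + M7)*H^4 + M11)*H^1
#       = M3*H^9 + M7*H^5 + M11*H^1
# which matches the M3, M7 and M11 terms of the original sum.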

use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

{
my ($INP, $OUTP, $LEN, $KEYP, $IVP, $XIP) = ("a0", "a1", "a2", "a3", "a4", "a5");
my ($T0, $T1, $T2, $T3) = ("t0", "t1", "t2", "t3");
my ($PADDING_LEN32) = ("t4");
my ($LEN32) = ("t5");
my ($CTR) = ("t6");
my ($FULL_BLOCK_LEN32) = ("a6");
my ($ORIGINAL_LEN32) = ("a7");
my ($PROCESSED_LEN) = ("a0");
my ($CTR_MASK) = ("v0");
my ($INPUT_PADDING_MASK) = ("v0");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
    $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
    $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
    $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));

# Do the AES-128 encryption rounds on the blocks in v28 (round keys in v1-v11).
sub aes_128_cipher_body {
    my $code=<<___;
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesz_vs $V28, $V1]}
    @{[vaesem_vs $V28, $V2]}
    @{[vaesem_vs $V28, $V3]}
    @{[vaesem_vs $V28, $V4]}
    @{[vaesem_vs $V28, $V5]}
    @{[vaesem_vs $V28, $V6]}
    @{[vaesem_vs $V28, $V7]}
    @{[vaesem_vs $V28, $V8]}
    @{[vaesem_vs $V28, $V9]}
    @{[vaesem_vs $V28, $V10]}
    @{[vaesef_vs $V28, $V11]}
___

    return $code;
}
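
# Note on the vector AES instructions used above (per the Zvkned spec):
# `vaesz.vs` performs the initial AddRoundKey, `vaesem.vs` performs one middle
# encryption round (SubBytes, ShiftRows, MixColumns, AddRoundKey) and
# `vaesef.vs` performs the final round (no MixColumns). The `.vs` forms use
# element group 0 of the second operand as the round key for every element
# group, which is why a single 128-bit round key drives the whole LMUL=4 group.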

# Do the AES-192 encryption rounds on the blocks in v28.
sub aes_192_cipher_body {
    my $TMP_REG = shift;

    my $code=<<___;
    # Load key 4
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 48
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesz_vs $V28, $V1]}
    @{[vaesem_vs $V28, $V2]}
    @{[vaesem_vs $V28, $V3]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 8
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 112
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V4]}
    @{[vaesem_vs $V28, $V5]}
    @{[vaesem_vs $V28, $V6]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 13
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 192
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V7]}
    @{[vaesem_vs $V28, $V8]}
    @{[vaesem_vs $V28, $V9]}
    @{[vaesem_vs $V28, $V10]}
    @{[vaesef_vs $V28, $V11]}
___

    return $code;
}

# Do the AES-256 encryption rounds on the blocks in v28.
sub aes_256_cipher_body {
    my $TMP_REG = shift;

    my $code=<<___;
    # Load key 3
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 32
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesz_vs $V28, $V1]}
    @{[vaesem_vs $V28, $V2]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 6
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 80
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V3]}
    @{[vaesem_vs $V28, $V4]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 9
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 128
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V5]}
    @{[vaesem_vs $V28, $V6]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 12
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 176
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V7]}
    @{[vaesem_vs $V28, $V8]}
    @{[vaesem_vs $V28, $V11]}
    # Load key 15
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    addi $TMP_REG, $KEYP, 224
    @{[vle32_v $V11, $TMP_REG]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vaesem_vs $V28, $V9]}
    @{[vaesem_vs $V28, $V10]}
    @{[vaesef_vs $V28, $V11]}
___

    return $code;
}

sub handle_padding_in_first_round {
    my $TMP_REG = shift;

    my $code=<<___;
    bnez $PADDING_LEN32, 1f

    ## without padding
    # Store ciphertext/plaintext
    @{[vse32_v $V28, $OUTP]}
    j 2f

    ## with padding
1:
    # Store ciphertext/plaintext using mask
    @{[vse32_v $V28, $OUTP, $INPUT_PADDING_MASK]}

    # Fill zeroes into the padding blocks
    @{[vsetvli "zero", $PADDING_LEN32, "e32", "m4", "tu", "ma"]}
    @{[vmv_v_i $V28, 0]}

    # The mask register (v0) was used for `INPUT_PADDING_MASK` above, so we
    # need to set up the ctr mask again.
    # ctr mask : [000100010001....]
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
    li $TMP_REG, 0b10001000
    @{[vmv_v_x $CTR_MASK, $TMP_REG]}
2:

___

    return $code;
}


# Do aes-128 enc for first round.
sub aes_128_first_round {
    my $PTR_OFFSET_REG = shift;
    my $TMP_REG = shift;

    my $code=<<___;
    # Load all 11 aes round keys to v1-v11 registers.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vle32_v $V1, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V2, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V3, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V4, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V5, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V6, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V7, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V8, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V9, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V10, $KEYP]}
    addi $KEYP, $KEYP, 16
    @{[vle32_v $V11, $KEYP]}

    # We already have the ciphertext/plaintext and ctr data for the first round.
    @{[aes_128_cipher_body]}

    # Compute AES ctr result.
    @{[vxor_vv $V28, $V28, $V24]}

    @{[handle_padding_in_first_round $TMP_REG]}

    add $INP, $INP, $PTR_OFFSET_REG
    add $OUTP, $OUTP, $PTR_OFFSET_REG
___

    return $code;
}

# Do aes-192 enc for first round.
sub aes_192_first_round {
    my $PTR_OFFSET_REG = shift;
    my $TMP_REG = shift;

    my $code=<<___;
    # We run out of vector registers (there are only 32), so we keep some of
    # the round keys resident and load the remaining ones inside the aes body.
    # The following round keys stay resident in v1-v10:
    #   keys 1, 2, 3, 5, 6, 7, 9, 10, 11 and 12.
    # These round keys are loaded inside the aes body:
    #   keys 4, 8 and 13.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    # key 1
    @{[vle32_v $V1, $KEYP]}
    # key 2
    addi $TMP_REG, $KEYP, 16
    @{[vle32_v $V2, $TMP_REG]}
    # key 3
    addi $TMP_REG, $KEYP, 32
    @{[vle32_v $V3, $TMP_REG]}
    # key 5
    addi $TMP_REG, $KEYP, 64
    @{[vle32_v $V4, $TMP_REG]}
    # key 6
    addi $TMP_REG, $KEYP, 80
    @{[vle32_v $V5, $TMP_REG]}
    # key 7
    addi $TMP_REG, $KEYP, 96
    @{[vle32_v $V6, $TMP_REG]}
    # key 9
    addi $TMP_REG, $KEYP, 128
    @{[vle32_v $V7, $TMP_REG]}
    # key 10
    addi $TMP_REG, $KEYP, 144
    @{[vle32_v $V8, $TMP_REG]}
    # key 11
    addi $TMP_REG, $KEYP, 160
    @{[vle32_v $V9, $TMP_REG]}
    # key 12
    addi $TMP_REG, $KEYP, 176
    @{[vle32_v $V10, $TMP_REG]}

    # We already have the ciphertext/plaintext and ctr data for the first round.
    @{[aes_192_cipher_body $TMP_REG]}

    # Compute AES ctr result.
    @{[vxor_vv $V28, $V28, $V24]}

    @{[handle_padding_in_first_round $TMP_REG]}

    add $INP, $INP, $PTR_OFFSET_REG
    add $OUTP, $OUTP, $PTR_OFFSET_REG
___

    return $code;
}

# Do aes-256 enc for first round.
sub aes_256_first_round {
    my $PTR_OFFSET_REG = shift;
    my $TMP_REG = shift;

    my $code=<<___;
    # We run out of vector registers (there are only 32), so we keep some of
    # the round keys resident and load the remaining ones inside the aes body.
    # The following round keys stay resident in v1-v10:
    #   keys 1, 2, 4, 5, 7, 8, 10, 11, 13 and 14.
    # These round keys are loaded inside the aes body:
    #   keys 3, 6, 9, 12 and 15.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    # key 1
    @{[vle32_v $V1, $KEYP]}
    # key 2
    addi $TMP_REG, $KEYP, 16
    @{[vle32_v $V2, $TMP_REG]}
    # key 4
    addi $TMP_REG, $KEYP, 48
    @{[vle32_v $V3, $TMP_REG]}
    # key 5
    addi $TMP_REG, $KEYP, 64
    @{[vle32_v $V4, $TMP_REG]}
    # key 7
    addi $TMP_REG, $KEYP, 96
    @{[vle32_v $V5, $TMP_REG]}
    # key 8
    addi $TMP_REG, $KEYP, 112
    @{[vle32_v $V6, $TMP_REG]}
    # key 10
    addi $TMP_REG, $KEYP, 144
    @{[vle32_v $V7, $TMP_REG]}
    # key 11
    addi $TMP_REG, $KEYP, 160
    @{[vle32_v $V8, $TMP_REG]}
    # key 13
    addi $TMP_REG, $KEYP, 192
    @{[vle32_v $V9, $TMP_REG]}
    # key 14
    addi $TMP_REG, $KEYP, 208
    @{[vle32_v $V10, $TMP_REG]}

    # We already have the ciphertext/plaintext and ctr data for the first round.
    @{[aes_256_cipher_body $TMP_REG]}

    # Compute AES ctr result.
    @{[vxor_vv $V28, $V28, $V24]}

    @{[handle_padding_in_first_round $TMP_REG]}

    add $INP, $INP, $PTR_OFFSET_REG
    add $OUTP, $OUTP, $PTR_OFFSET_REG
___

    return $code;
}

sub aes_gcm_init {
    my $code=<<___;
    # Compute the AES-GCM full-block e32 length for `LMUL=4`. We handle
    # multiple AES-GCM blocks at the same time within an `LMUL=4` register
    # group. The AES-GCM SEW is e32 and the EGW is 128 bits.
    #   FULL_BLOCK_LEN32 = (VLEN*LMUL)/(EGW) * (EGW/SEW) = (VLEN*4)/(32*4) * 4
    #                    = (VLEN*4)/32
    # We get FULL_BLOCK_LEN32 from the VL returned by `vsetvli` with `e32, m4`.
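    # For example, on a hypothetical VLEN=128 implementation this gives
    # FULL_BLOCK_LEN32 = (128*4)/32 = 16 (i.e. 4 GCM blocks per iteration),
    # and on VLEN=256 it gives 32 (8 GCM blocks per iteration).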
    @{[vsetvli $FULL_BLOCK_LEN32, "zero", "e32", "m4", "ta", "ma"]}
    # If `LEN32 % FULL_BLOCK_LEN32` is not zero, we fill the first iteration
    # with leading zero padding so that every iteration can handle
    # FULL_BLOCK_LEN32 e32 elements.

    ## Prepare the H^n multiplier in v16 for the GCM multiplication. The `n` is
    ## the number of GCM blocks in a LMUL=4 register group.
    ##   n = ((VLEN*LMUL)/(32*4)) = ((VLEN*4)/(32*4))
    ##     = (VLEN/32)
    ## We use vsetvli with `e32, m1` to compute `n`.
    @{[vsetvli $T0, "zero", "e32", "m1", "ta", "ma"]}

    # The H is at `gcm128_context.Htable[0]` (addr(Xi)+16*2).
    addi $T1, $XIP, 32
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vle32_v $V31, $T1]}

    # Compute the H^n
    li $T1, 1
1:
    @{[vgmul_vv $V31, $V31]}
    slli $T1, $T1, 1
    bltu $T1, $T0, 1b
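    # For example, on a hypothetical VLEN=128 implementation n = 4, so the loop
    # above squares H twice (H -> H^2 -> H^4); on VLEN=256, n = 8 and it squares
    # three times to get H^8.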

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vmv_v_i $V16, 0]}
    @{[vaesz_vs $V16, $V31]}

    #### Load plaintext into v24 and handle padding. We also load the init tag
    #### data into v20 and prepare the AES ctr input data into v12 and v28.
    @{[vmv_v_i $V20, 0]}

    ## Prepare the AES ctr input data into v12.
    # Setup ctr input mask.
    # ctr mask : [000100010001....]
    # Note: The actual vl should be `FULL_BLOCK_LEN32/4 * 2`, but we just use
    #   `FULL_BLOCK_LEN32` here.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
    li $T0, 0b10001000
    @{[vmv_v_x $CTR_MASK, $T0]}
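    # With SEW=8, splatting 0b10001000 sets mask bits 3 and 7 of every byte, so
    # every 4th e32 element (the element holding the 32-bit big-endian counter
    # of each 128-bit block) is active under this mask.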
    # Load IV.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vle32_v $V31, $IVP]}
    # Convert the big-endian counter into little-endian.
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "mu"]}
    @{[vrev8_v $V31, $V31, $CTR_MASK]}
    # Splat the single IV block to v12.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vmv_v_i $V12, 0]}
    @{[vaesz_vs $V12, $V31]}
    # Prepare the ctr counter into v8
    # v8: [x, x, x, 0, x, x, x, 1, x, x, x, 2, ...]
    @{[viota_m $V8, $CTR_MASK, $CTR_MASK]}
    # Merge IV and ctr counter into v12.
    # v12:[x, x, x, count+0, x, x, x, count+1, ...]
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    @{[vadd_vv $V12, $V12, $V8, $CTR_MASK]}

    li $PADDING_LEN32, 0
    # Get the e32 length for the first round.
    # If `LEN32 & (FULL_BLOCK_LEN32-1)` is non-zero, the first round has
    # leading zero padding.
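    # For example (hypothetical VLEN=128, so FULL_BLOCK_LEN32=16): a 96-byte
    # message gives LEN32=24 and 24&15=8, so the first round handles 8 e32
    # elements (2 blocks) of real data preceded by 8 elements (2 blocks) of
    # zero padding, and the remaining 16 elements form one full iteration.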
    addi $T0, $FULL_BLOCK_LEN32, -1
    and $T0, $T0, $LEN32
    beqz $T0, 1f

    ## with padding
    sub $LEN32, $LEN32, $T0
    sub $PADDING_LEN32, $FULL_BLOCK_LEN32, $T0
    # padding block size
    srli $T1, $PADDING_LEN32, 2
    # padding byte size
    slli $T2, $PADDING_LEN32, 2

    # Adjust the ctr counter to make the counter start from `counter+0` for the
    # first non-padding block.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    @{[vsub_vx $V12, $V12, $T1, $CTR_MASK]}
    # Prepare the AES ctr input into v28.
    # The ctr data uses big-endian form.
    @{[vmv_v_v $V28, $V12]}
    @{[vrev8_v $V28, $V28, $CTR_MASK]}

    # Prepare the mask for input loading in the first round. We use
    # `VL=FULL_BLOCK_LEN32` with the mask in the first round.
    # Adjust input ptr.
    sub $INP, $INP, $T2
    # Adjust output ptr.
    sub $OUTP, $OUTP, $T2
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e16", "m2", "ta", "ma"]}
    @{[vid_v $V2]}
    # The `vmsgeu.vx` pseudoinstruction is not available here, so we use
    # `vmsgtu.vx` instead. The equivalent code would be:
    #   vmsgeu.vx $INPUT_PADDING_MASK, $V2, $PADDING_LEN32
    addi $T0, $PADDING_LEN32, -1
    @{[vmsgtu_vx $INPUT_PADDING_MASK, $V2, $T0]}
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vmv_v_i $V24, 0]}
    # Load the input for length FULL_BLOCK_LEN32 with mask.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    @{[vle32_v $V24, $INP, $INPUT_PADDING_MASK]}

    # Load the init `Xi` data to v20 with preceding zero padding.
    # Adjust Xi ptr.
    sub $T0, $XIP, $T2
    # Load for length `zero-padding-e32-length + 4`.
    addi $T1, $PADDING_LEN32, 4
    @{[vsetvli "zero", $T1, "e32", "m4", "tu", "mu"]}
    @{[vle32_v $V20, $T0, $INPUT_PADDING_MASK]}
    j 2f

1:
    ## without padding
    sub $LEN32, $LEN32, $FULL_BLOCK_LEN32

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
    @{[vle32_v $V24, $INP]}

    # Load the init Xi data to v20.
    @{[vsetivli "zero", 4, "e32", "m1", "tu", "ma"]}
    @{[vle32_v $V20, $XIP]}

    # Prepare the AES ctr input into v28.
    # The ctr data uses big-endian form.
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    @{[vmv_v_v $V28, $V12]}
    @{[vrev8_v $V28, $V28, $CTR_MASK]}
2:
___

    return $code;
}

sub prepare_input_and_ctr {
    my $PTR_OFFSET_REG = shift;

    my $code=<<___;
    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
    # Increase ctr in v12.
    @{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
    sub $LEN32, $LEN32, $FULL_BLOCK_LEN32
    # Load the input (plaintext for encryption, ciphertext for decryption)
    # into v24.
    @{[vsetvli "zero", "zero", "e32", "m4", "ta", "ma"]}
    @{[vle32_v $V24, $INP]}
    # Prepare the AES ctr input into v28.
    # The ctr data uses big-endian form.
    @{[vmv_v_v $V28, $V12]}
    add $INP, $INP, $PTR_OFFSET_REG
    @{[vsetvli "zero", "zero", "e32", "m4", "ta", "mu"]}
    @{[vrev8_v $V28, $V28, $CTR_MASK]}
___

    return $code;
}

# Store the current CTR back to the IV buffer.
sub store_current_ctr {
    my $code=<<___;
    @{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
    # Update current ctr value to v12
    @{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
    # Convert ctr to big-endian counter.
    @{[vrev8_v $V12, $V12, $CTR_MASK]}
    @{[vse32_v $V12, $IVP, $CTR_MASK]}
___

    return $code;
}

# Compute the final tag into v0 from the partial tags in v20.
sub compute_final_tag {
    my $TMP_REG = shift;

    my $code=<<___;
    # The H is at `gcm128_context.Htable[0]` (addr(Xi)+16*2).
    # Load H to v1
    addi $TMP_REG, $XIP, 32
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vle32_v $V1, $TMP_REG]}
    # Multiply each partial tag by a power of H and XOR the results together
    # (see the note after the loop below).
    # Handle 1st partial tag
    @{[vmv_v_v $V0, $V20]}
    @{[vgmul_vv $V0, $V1]}
    # Handle 2nd to N-th partial tags
    li $TMP_REG, 4
1:
    @{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
    @{[vslidedown_vx $V4, $V20, $TMP_REG]}
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vghsh_vv $V0, $V1, $V4]}
    addi $TMP_REG, $TMP_REG, 4
    blt $TMP_REG, $FULL_BLOCK_LEN32, 1b
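    # At this point, with n = FULL_BLOCK_LEN32/4 partial tags, the serial chain
    # above has computed (a sketch of the reduction):
    #   v0 = tag0*H^n ^ tag1*H^(n-1) ^ ... ^ tag(n-1)*H^1
    # which applies the per-lane powers of H described in the file header.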
___

    return $code;
}

################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt(const unsigned char *in,
#                                               unsigned char *out, size_t len,
#                                               const void *key,
#                                               unsigned char ivec[16], u64 *Xi);
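# Note: only whole 16-byte blocks are handled here. The routine returns the
# number of bytes actually processed (len rounded down to a multiple of 16),
# or 0 when len < 16.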
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt:
    srli $T0, $LEN, 4
    beqz $T0, .Lenc_end
    slli $LEN32, $T0, 2

    mv $ORIGINAL_LEN32, $LEN32

    @{[aes_gcm_init]}

    # Load number of rounds
    lwu $T0, 240($KEYP)
    li $T1, 14
    li $T2, 12
    li $T3, 10

    beq $T0, $T1, aes_gcm_enc_blocks_256
    beq $T0, $T2, aes_gcm_enc_blocks_192
    beq $T0, $T3, aes_gcm_enc_blocks_128

.Lenc_end:
    li $PROCESSED_LEN, 0
    ret

.size rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_128:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_128_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Lenc_blocks_128:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Lenc_blocks_128_end
    @{[vghsh_vv $V20, $V16, $V28]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_128_cipher_body]}

    # Compute AES ctr ciphertext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store ciphertext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Lenc_blocks_128
.Lenc_blocks_128_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V28]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_enc_blocks_128,.-aes_gcm_enc_blocks_128
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_192:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_192_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Lenc_blocks_192:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Lenc_blocks_192_end
    @{[vghsh_vv $V20, $V16, $V28]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_192_cipher_body $T1]}

    # Compute AES ctr ciphertext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store ciphertext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Lenc_blocks_192
.Lenc_blocks_192_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V28]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_enc_blocks_192,.-aes_gcm_enc_blocks_192
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_256:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_256_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Lenc_blocks_256:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Lenc_blocks_256_end
    @{[vghsh_vv $V20, $V16, $V28]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_256_cipher_body $T1]}

    # Compute AES ctr ciphertext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store ciphertext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Lenc_blocks_256
.Lenc_blocks_256_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V28]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_enc_blocks_256,.-aes_gcm_enc_blocks_256
___

}

################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt(const unsigned char *in,
#                                               unsigned char *out, size_t len,
#                                               const void *key,
#                                               unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt:
    srli $T0, $LEN, 4
    beqz $T0, .Ldec_end
    slli $LEN32, $T0, 2

    mv $ORIGINAL_LEN32, $LEN32

    @{[aes_gcm_init]}

    # Load number of rounds
    lwu $T0, 240($KEYP)
    li $T1, 14
    li $T2, 12
    li $T3, 10

    beq $T0, $T1, aes_gcm_dec_blocks_256
    beq $T0, $T2, aes_gcm_dec_blocks_192
    beq $T0, $T3, aes_gcm_dec_blocks_128

.Ldec_end:
    li $PROCESSED_LEN, 0
    ret
.size rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_128:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_128_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Ldec_blocks_128:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Ldec_blocks_128_end
    @{[vghsh_vv $V20, $V16, $V24]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_128_cipher_body]}

    # Compute AES ctr plaintext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store plaintext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Ldec_blocks_128
.Ldec_blocks_128_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V24]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_dec_blocks_128,.-aes_gcm_dec_blocks_128
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_192:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_192_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Ldec_blocks_192:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Ldec_blocks_192_end
    @{[vghsh_vv $V20, $V16, $V24]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_192_cipher_body $T1]}

    # Compute AES ctr plaintext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store plaintext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Ldec_blocks_192
.Ldec_blocks_192_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V24]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_dec_blocks_192,.-aes_gcm_dec_blocks_192
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_256:
    srli $CTR, $FULL_BLOCK_LEN32, 2
    slli $T0, $FULL_BLOCK_LEN32, 2

    @{[aes_256_first_round $T0, $T1]}

    @{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}

.Ldec_blocks_256:
    # Compute the partial tags.
    # The partial tags are multiplied with [H^n, H^n, ..., H^n]:
    #   [tag0, tag1, ...] =
    #     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
    # We skip the [H^n, H^n, ..., H^n] multiplication for the last round.
    beqz $LEN32, .Ldec_blocks_256_end
    @{[vghsh_vv $V20, $V16, $V24]}

    @{[prepare_input_and_ctr $T0]}

    @{[aes_256_cipher_body $T1]}

    # Compute AES ctr plaintext result.
    @{[vxor_vv $V28, $V28, $V24]}

    # Store plaintext
    @{[vse32_v $V28, $OUTP]}
    add $OUTP, $OUTP, $T0

    j .Ldec_blocks_256
.Ldec_blocks_256_end:

    # Add ciphertext into partial tag
    @{[vxor_vv $V20, $V20, $V24]}

    @{[store_current_ctr]}

    @{[compute_final_tag $T1]}

    # Save the final tag
    @{[vse32_v $V0, $XIP]}

    # Return the processed size.
    slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
    ret
.size aes_gcm_dec_blocks_256,.-aes_gcm_dec_blocks_256
___

}
}

print $code;

close STDOUT or die "error closing STDOUT: $!";