Home | History | Annotate | Line # | Download | only in asm
      1 #! /usr/bin/env perl
      2 # This file is dual-licensed, meaning that you can use it under your
      3 # choice of either of the following two licenses:
      4 #
      5 # Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
      6 #
      7 # Licensed under the Apache License 2.0 (the "License"). You can obtain
      8 # a copy in the file LICENSE in the source distribution or at
      9 # https://www.openssl.org/source/license.html
     10 #
     11 # or
     12 #
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
     15 # All rights reserved.
     16 #
     17 # Redistribution and use in source and binary forms, with or without
     18 # modification, are permitted provided that the following conditions
     19 # are met:
     20 # 1. Redistributions of source code must retain the above copyright
     21 #    notice, this list of conditions and the following disclaimer.
     22 # 2. Redistributions in binary form must reproduce the above copyright
     23 #    notice, this list of conditions and the following disclaimer in the
     24 #    documentation and/or other materials provided with the distribution.
     25 #
     26 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     27 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     28 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     29 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     30 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     31 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     32 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     33 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     34 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     35 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     36 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     37 
     38 # The generated code of this file depends on the following RISC-V extensions:
     39 # - RV64I
     40 # - RISC-V Vector ('V') with VLEN >= 128
     41 # - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
     42 # - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb')
     43 
     44 use strict;
     45 use warnings;
     46 
     47 use FindBin qw($Bin);
     48 use lib "$Bin";
     49 use lib "$Bin/../../perlasm";
     50 use riscv;
     51 
# $output is the last argument if it looks like a file (it has an extension);
# $flavour is the first argument if it doesn't look like a file.
my $output  = @ARGV && $ARGV[-1] =~ m{\.\w+\z} ? pop   : undef;
my $flavour = @ARGV && $ARGV[0]  !~ m{\.}      ? shift : undef;

# Redirect STDOUT to the requested output file. Three-arg open avoids
# mode injection through the filename, and the error check replaces the
# original unchecked two-arg open, which silently produced no output on
# failure.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
     58 
# Accumulator for the generated assembly text; starts with the section marker.
my $code = ".text\n";

# Symbolic names for all 32 RISC-V vector registers (v0 .. v31).
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
    $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
    $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
    $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map { "v$_" } 0 .. 31;

# Label of the round-constant table emitted at the end of the file.
my $K256 = "K256";

# Function arguments (ABI registers a0-a2), plus scratch registers:
# $KT walks the round-constant table, $H2 points 8 bytes past $H, and
# $INDEX_PATTERN holds the byte-index pattern for the indexed load/store.
my ($H, $INP, $LEN, $KT, $H2, $INDEX_PATTERN) =
    ("a0", "a1", "a2", "a3", "t3", "t4");
     73 
# Emit code that preloads all 64 SHA-256 round constants (K256) into
# v10-v25, four 32-bit words per vector register, advancing $KT by 16
# bytes between loads. The original body was 16 hand-unrolled copies of
# the same load/advance pair; this data-driven loop emits byte-identical
# assembly text.
#
# Returns the generated assembly as a string.
sub sha_256_load_constant {
    # Destination registers, in table order; the last load needs no
    # trailing pointer bump, so it is emitted separately.
    my @kt_regs = ($V10, $V11, $V12, $V13, $V14, $V15, $V16, $V17,
                   $V18, $V19, $V20, $V21, $V22, $V23, $V24, $V25);
    my $last_vreg = pop @kt_regs;

    my $code = <<___;
    la $KT, $K256 # Load round constants K256
___
    for my $vreg (@kt_regs) {
        $code .= <<___;
    @{[vle32_v $vreg, $KT]}
    addi $KT, $KT, 16
___
    }
    $code .= <<___;
    @{[vle32_v $last_vreg, $KT]}
___

    return $code;
}
    112 
################################################################################
# void sha256_block_data_order_zvkb_zvknha_or_zvknhb(void *c, const void *p, size_t len)
#
# Generates the SHA-256 compression function plus the K256 constant table.
# Register usage (from the visible code):
#   a0 ($H):   pointer to the 8 x 32-bit hash state words
#   a1 ($INP): pointer to the message data; advanced by 64 bytes per loop
#   a2 ($LEN): loop counter, decremented once per 64-byte block until zero
#              (so callers pass the number of blocks — TODO confirm against
#              the C prototype's "len" units)
# The working state lives in v6/v7 in the {f,e,b,a},{h,g,d,c} order that the
# vsha2* instructions expect; an indexed load/store through the byte-index
# pattern held in v26 reorders the state words on entry and exit. v30/v31
# snapshot the state before each block so it can be added back afterwards.
# NOTE: the heredoc below is emitted verbatim into the .S output, so its
# '#' lines double as assembly comments and must not be altered here.
$code .= <<___;
.p2align 2
.globl sha256_block_data_order_zvkb_zvknha_or_zvknhb
.type   sha256_block_data_order_zvkb_zvknha_or_zvknhb,\@function
sha256_block_data_order_zvkb_zvknha_or_zvknhb:
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}

    @{[sha_256_load_constant]}

    # H is stored as {a,b,c,d},{e,f,g,h}, but we need {f,e,b,a},{h,g,d,c}
    # The dst vtype is e32m1 and the index vtype is e8mf4.
    # We use index-load with the following index pattern at v26.
    #   i8 index:
    #     20, 16, 4, 0
    # Instead of setting the i8 index, we could use a single 32bit
    # little-endian value to cover the 4xi8 index.
    #   i32 value:
    #     0x 00 04 10 14
    li $INDEX_PATTERN, 0x00041014
    @{[vsetivli "zero", 1, "e32", "m1", "ta", "ma"]}
    @{[vmv_v_x $V26, $INDEX_PATTERN]}

    addi $H2, $H, 8

    # Use index-load to get {f,e,b,a},{h,g,d,c}
    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
    @{[vluxei8_v $V6, $H, $V26]}
    @{[vluxei8_v $V7, $H2, $V26]}

    # Setup v0 mask for the vmerge to replace the first word (idx==0) in key-scheduling.
    # The AVL is 4 in SHA, so we could use a single e8(8 element masking) for masking.
    @{[vsetivli "zero", 1, "e8", "m1", "ta", "ma"]}
    @{[vmv_v_i $V0, 0x01]}

    @{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}

L_round_loop:
    # Decrement length by 1
    add $LEN, $LEN, -1

    # Keep the current state as we need it later: H' = H+{a',b',c',...,h'}.
    @{[vmv_v_v $V30, $V6]}
    @{[vmv_v_v $V31, $V7]}

    # Load the 512-bits of the message block in v1-v4 and perform
    # an endian swap on each 4 bytes element.
    @{[vle32_v $V1, $INP]}
    @{[vrev8_v $V1, $V1]}
    add $INP, $INP, 16
    @{[vle32_v $V2, $INP]}
    @{[vrev8_v $V2, $V2]}
    add $INP, $INP, 16
    @{[vle32_v $V3, $INP]}
    @{[vrev8_v $V3, $V3]}
    add $INP, $INP, 16
    @{[vle32_v $V4, $INP]}
    @{[vrev8_v $V4, $V4]}
    add $INP, $INP, 16

    # Quad-round 0 (+0, Wt from oldest to newest in v1->v2->v3->v4)
    @{[vadd_vv $V5, $V10, $V1]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V3, $V2, $V0]}
    @{[vsha2ms_vv $V1, $V5, $V4]}  # Generate W[19:16]

    # Quad-round 1 (+1, v2->v3->v4->v1)
    @{[vadd_vv $V5, $V11, $V2]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V4, $V3, $V0]}
    @{[vsha2ms_vv $V2, $V5, $V1]}  # Generate W[23:20]

    # Quad-round 2 (+2, v3->v4->v1->v2)
    @{[vadd_vv $V5, $V12, $V3]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V1, $V4, $V0]}
    @{[vsha2ms_vv $V3, $V5, $V2]}  # Generate W[27:24]

    # Quad-round 3 (+3, v4->v1->v2->v3)
    @{[vadd_vv $V5, $V13, $V4]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V2, $V1, $V0]}
    @{[vsha2ms_vv $V4, $V5, $V3]}  # Generate W[31:28]

    # Quad-round 4 (+0, v1->v2->v3->v4)
    @{[vadd_vv $V5, $V14, $V1]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V3, $V2, $V0]}
    @{[vsha2ms_vv $V1, $V5, $V4]}  # Generate W[35:32]

    # Quad-round 5 (+1, v2->v3->v4->v1)
    @{[vadd_vv $V5, $V15, $V2]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V4, $V3, $V0]}
    @{[vsha2ms_vv $V2, $V5, $V1]}  # Generate W[39:36]

    # Quad-round 6 (+2, v3->v4->v1->v2)
    @{[vadd_vv $V5, $V16, $V3]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V1, $V4, $V0]}
    @{[vsha2ms_vv $V3, $V5, $V2]}  # Generate W[43:40]

    # Quad-round 7 (+3, v4->v1->v2->v3)
    @{[vadd_vv $V5, $V17, $V4]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V2, $V1, $V0]}
    @{[vsha2ms_vv $V4, $V5, $V3]}  # Generate W[47:44]

    # Quad-round 8 (+0, v1->v2->v3->v4)
    @{[vadd_vv $V5, $V18, $V1]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V3, $V2, $V0]}
    @{[vsha2ms_vv $V1, $V5, $V4]}  # Generate W[51:48]

    # Quad-round 9 (+1, v2->v3->v4->v1)
    @{[vadd_vv $V5, $V19, $V2]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V4, $V3, $V0]}
    @{[vsha2ms_vv $V2, $V5, $V1]}  # Generate W[55:52]

    # Quad-round 10 (+2, v3->v4->v1->v2)
    @{[vadd_vv $V5, $V20, $V3]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V1, $V4, $V0]}
    @{[vsha2ms_vv $V3, $V5, $V2]}  # Generate W[59:56]

    # Quad-round 11 (+3, v4->v1->v2->v3)
    @{[vadd_vv $V5, $V21, $V4]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}
    @{[vmerge_vvm $V5, $V2, $V1, $V0]}
    @{[vsha2ms_vv $V4, $V5, $V3]}  # Generate W[63:60]

    # Quad-round 12 (+0, v1->v2->v3->v4)
    # Note that we stop generating new message schedule words (Wt, v1-13)
    # as we already generated all the words we end up consuming (i.e., W[63:60]).
    @{[vadd_vv $V5, $V22, $V1]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}

    # Quad-round 13 (+1, v2->v3->v4->v1)
    @{[vadd_vv $V5, $V23, $V2]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}

    # Quad-round 14 (+2, v3->v4->v1->v2)
    @{[vadd_vv $V5, $V24, $V3]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}

    # Quad-round 15 (+3, v4->v1->v2->v3)
    @{[vadd_vv $V5, $V25, $V4]}
    @{[vsha2cl_vv $V7, $V6, $V5]}
    @{[vsha2ch_vv $V6, $V7, $V5]}

    # H' = H+{a',b',c',...,h'}
    @{[vadd_vv $V6, $V30, $V6]}
    @{[vadd_vv $V7, $V31, $V7]}
    bnez $LEN, L_round_loop

    # Store {f,e,b,a},{h,g,d,c} back to {a,b,c,d},{e,f,g,h}.
    @{[vsuxei8_v $V6, $H, $V26]}
    @{[vsuxei8_v $V7, $H2, $V26]}

    ret
.size sha256_block_data_order_zvkb_zvknha_or_zvknhb,.-sha256_block_data_order_zvkb_zvknha_or_zvknhb

.p2align 2
.type $K256,\@object
$K256:
    .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
    .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
    .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
    .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
    .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
    .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
    .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
    .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
    .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
    .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
    .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
    .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
    .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
    .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
    .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
    .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size $K256,.-$K256
___
    313 
# Emit the accumulated assembly (STDOUT may have been redirected to the
# requested output file) and propagate any buffered-write failure.
print STDOUT $code;

close STDOUT or die "error closing STDOUT: $!";
    317