# ghash-riscv64-zvkb-zvbc.S revision 1.1
# GHASH (GCM) primitives for RISC-V 64 using the Zvkb and Zvbc vector extensions.
#include <machine/asm.h>
.text
.p2align 3
#-----------------------------------------------------------------------
# gcm_init_rv64i_zvkb_zvbc
#
# Precompute the GHASH key: load the 128-bit key (big-endian in memory),
# endian-swap it, multiply it by x (a single left shift reduced modulo
# the GCM polynomial from Lpolymod) and store the result for use by
# gmult/ghash below.
# In:    a0 = output pointer (16 bytes written)
#        a1 = pointer to the 16-byte key
#        (presumably Htable / H as in the OpenSSL GHASH API — confirm
#        against the caller)
# Clobb: t0, t1, t2, v0-v4, vl/vtype
#
# Vector instructions are hand-encoded as .word so the file assembles
# with toolchains lacking Zvkb/Zvbc support; the intended mnemonic is
# given in each trailing comment.
#-----------------------------------------------------------------------
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,@function
gcm_init_rv64i_zvkb_zvbc:
    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add a1, a1, 8
    li t0, -8                       # negative stride: swaps the two 64-bit halves
    li t1, 63
    la t2, Lpolymod

    .word 0xc1817057 # vsetivli x0, 2, e64, m1, tu, mu

    .word 173404295    # vlse64.v v1, (a1), t0
    .word 33812743          # vle64.v v2, (t2)

    # Shift one left and get the carry bits.
    .word 2719171031     # vsrl.vx v3, v1, t1
    .word 2517676247         # vsll.vi v1, v1, 1

    # Use the fact that the polynomial degree is no more than 128,
    # i.e. only the LSB of the upper half could be set.
    # Thanks to this we don't need to do the full reduction here.
    # Instead simply subtract the reduction polynomial.
    # This idea was taken from x86 ghash implementation in OpenSSL.
    .word 976269911     # vslideup.vi v4, v3, 1
    .word 1043378647   # vslidedown.vi v3, v3, 1

    # Propagate the inter-element carry into the upper half only
    # (mask 2 selects element 1).
    .word 1577136215              # vmv.v.i v0, 2
    .word 672268503    # vor.vv v1, v1, v4, v0.t

    # Need to set the mask to 3, if the carry bit is set.
    .word 1577156695            # vmv.v.v v0, v3
    .word 1577071063              # vmv.v.i v3, 0
    .word 1546760663      # vmerge.vim v3, v3, 3, v0
    .word 1577156695            # vmv.v.v v0, v3

    # Conditionally subtract (XOR) the reduction polynomial.
    .word 739311831   # vxor.vv v1, v1, v2, v0.t

    .word 33910951        # vse64.v v1, (a0)
    ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
.text
.p2align 3
#-----------------------------------------------------------------------
# gcm_gmult_rv64i_zvkb_zvbc
#
# Multiply the current hash value Xi by the precomputed key (from
# gcm_init) in GF(2^128) using carry-less multiplies (Zvbc) and reduce
# modulo the GCM polynomial; Xi is updated in place.
# In:    a0 = Xi (16 bytes, read and written)
#        a1 = precomputed key (16 bytes, as written by gcm_init)
# Clobb: t0-t4, v0-v6, vl/vtype
#-----------------------------------------------------------------------
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,@function
gcm_gmult_rv64i_zvkb_zvbc:
    ld t0, (a1)                     # t0 = low half of key (a0 in the math below)
    ld t1, 8(a1)                    # t1 = high half of key (a1 in the math below)
    li t2, 63
    la t3, Lpolymod
    ld t3, 8(t3)                    # t3 = reduction constant P

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add a0, a0, 8
    li t4, -8

    .word 0xc1817057 # vsetivli x0, 2, e64, m1, tu, mu

    .word 198537863    # vlse64.v v5, (a0), t4
    .word 1247060695            # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)h

    # v1 = (a0b1)l,(a0b0)l
    .word 844292311   # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    .word 911401431  # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    .word 844325463   # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    .word 911434071   # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    .word 976270039     # vslideup.vi v5, v3, 1
    .word 977318743     # vslideup.vi v6, v4, 1
    .word 1043378647   # vslidedown.vi v3, v3, 1
    .word 1044427351   # vslidedown.vi v4, v4, 1

    .word 1577103447              # vmv.v.i v0, 1
    # v2 += (a0b1)h
    .word 740393303   # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    .word 740426071   # vxor.vv v2, v2, v4, v0.t

    .word 1577136215              # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    .word 739410135   # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    .word 739442903   # vxor.vv v1, v1, v6, v0.t

    # Now the 256bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of the Gueron's Montgomery reduction.
    # The difference being the order of some operations has been changed,
    # to make a better use of vclmul(h) instructions.

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2                 (mask 2 still set from above)
    .word 940618199 # vslideup.vi v3, v1, 1, v0.t
    .word 809394647 # vclmul.vx v3, v3, t3, v0.t
    .word 739344599   # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    #v3 = (c1 * P)l, 0
    .word 807297495 # vclmul.vx v3, v1, t3, v0.t
    #v4 = (c1 * P)h, (c0 * P)h
    .word 907960919   # vclmulh.vx v4, v1, t3

    .word 1577103447              # vmv.v.i v0, 1
    .word 1043378647   # vslidedown.vi v3, v3, 1

    .word 772931799       # vxor.vv v1, v1, v4
    .word 739344599   # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper upper part of the product
    .word 773882199       # vxor.vv v2, v2, v1

    # Byte-swap the result and store it back with the element-swapping
    # stride, completing the full endianness swap.
    .word 1243914583            # vrev8.v v2, v2
    .word 198537511    # vsse64.v v2, (a0), t4
    ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
.p2align 3
#-----------------------------------------------------------------------
# gcm_ghash_rv64i_zvkb_zvbc
#
# Hash a buffer into Xi: for each 16-byte input block,
#   Xi = (Xi ^ block) * H  mod P.
# In:    a0 = Xi (16 bytes, read and written)
#        a1 = precomputed key (16 bytes, as written by gcm_init)
#        a2 = input buffer
#        a3 = length in bytes (assumed a positive multiple of 16 —
#             the loop subtracts 16 per block and exits when a3 == 0)
# Clobb: t0-t4, a2, a3, v0-v7, vl/vtype
#
# NOTE(review): several of the original mnemonic comments in this loop
# did not match their .word encodings; the register operands in the
# comments below have been corrected from decoding the machine words
# (input block in v7, byte-swapped Xi carried across iterations in v5).
#-----------------------------------------------------------------------
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,@function
gcm_ghash_rv64i_zvkb_zvbc:
    ld t0, (a1)                     # t0 = low half of key (a0 in the math below)
    ld t1, 8(a1)                    # t1 = high half of key (a1 in the math below)
    li t2, 63
    la t3, Lpolymod
    ld t3, 8(t3)                    # t3 = reduction constant P

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add a0, a0, 8
    add a2, a2, 8
    li t4, -8

    .word 0xc1817057 # vsetivli x0, 2, e64, m1, tu, mu

    # v5 = Xi with its two 64-bit halves swapped (bytes within each
    # half are reversed later, together with the per-block XOR).
    .word 198537863      # vlse64.v v5, (a0), t4

Lstep:
    # Read input data, with the same element-swapping negative stride.
    .word 198603655   # vlse64.v v7, (a2), t4
    add a2, a2, 16
    add a3, a3, -16
    # XOR the input block into Xi.
    .word 777224919       # vxor.vv v5, v5, v7

    # Byte-reverse each half: combined with the stride -8 loads this
    # completes the full 128-bit endianness swap.
    .word 1247060695            # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)h

    # v1 = (a0b1)l,(a0b0)l
    .word 844292311   # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    .word 911401431  # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    .word 844325463   # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    .word 911434071   # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    .word 976270039     # vslideup.vi v5, v3, 1
    .word 977318743     # vslideup.vi v6, v4, 1
    .word 1043378647   # vslidedown.vi v3, v3, 1
    .word 1044427351   # vslidedown.vi v4, v4, 1

    .word 1577103447              # vmv.v.i v0, 1
    # v2 += (a0b1)h
    .word 740393303   # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    .word 740426071   # vxor.vv v2, v2, v4, v0.t

    .word 1577136215              # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    .word 739410135   # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    .word 739442903   # vxor.vv v1, v1, v6, v0.t

    # Now the 256bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of the Gueron's Montgomery reduction.
    # The difference being the order of some operations has been changed,
    # to make a better use of vclmul(h) instructions.

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2                 (mask 2 still set from above)
    .word 940618199 # vslideup.vi v3, v1, 1, v0.t
    .word 809394647 # vclmul.vx v3, v3, t3, v0.t
    .word 739344599   # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    #v3 = (c1 * P)l, 0
    .word 807297495 # vclmul.vx v3, v1, t3, v0.t
    #v4 = (c1 * P)h, (c0 * P)h
    .word 907960919   # vclmulh.vx v4, v1, t3

    .word 1577103447              # vmv.v.i v0, 1
    .word 1043378647   # vslidedown.vi v3, v3, 1

    .word 772931799       # vxor.vv v1, v1, v4
    .word 739344599   # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper upper part of the product
    .word 773882199       # vxor.vv v2, v2, v1

    # Keep the byte-swapped result in v5 for the next iteration (it is
    # un-swapped again at the top of the loop together with the input).
    .word 1243914967            # vrev8.v v5, v2

    bnez a3, Lstep

    # Store the final byte-swapped Xi with the element-swapping stride,
    # completing the endianness swap on the way out.
    .word 198537895    # vsse64.v v5, (a0), t4
    ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
.p2align 4
# GCM reduction constant, two 64-bit elements as loaded by the code
# above. The 0xc2... doubleword is the reduction multiplier P used by
# the vclmul-based Montgomery-style reduction — presumably the GCM
# field polynomial in the bit-reflected form standard in carry-less
# multiply GHASH implementations (matches the x86 OpenSSL constant).
Lpolymod:
        .dword 0x0000000000000001
        .dword 0xc200000000000000
.size Lpolymod,.-Lpolymod