#include <machine/asm.h>
.text
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,@function
################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u64 Htable[2], const u64 H[2])
#
# Prepare the GHASH key: load the 128-bit hash subkey H (big-endian in memory,
# at a1), compute H << 1 with a conditional XOR of the reduction polynomial
# when a carry comes out of bit 127, and store the 128-bit result at a0.
#
# The vector instructions are emitted as raw .word encodings (decoded mnemonic
# in the trailing comment of each line) so the file assembles with toolchains
# that lack Zvkb/Zvbc support. Requires a CPU with the Zvkb and Zvbc vector
# extensions at run time.
#
# In:       a0 = Htable (output, 128 bits)
#           a1 = H, the hash subkey (input, 128 bits)
# Clobbers: t0, t1, t2, v0-v4, flags of vector CSRs set by vsetivli
################################################################################
gcm_init_rv64i_zvkb_zvbc:
    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add     a1, a1, 8
    li      t0, -8                  # negative stride: load element 1, then 0
    li      t1, 63                  # shift count used to extract carry bits
    la      t2, Lpolymod            # address of the reduction constant

    .word 0xc1817057 # vsetivli x0, 2, e64, m1, tu, mu

    .word 173404295 # vlse64.v v1, (a1), t0
    .word 33812743 # vle64.v v2, (t2)

    # Shift one left and get the carry bits.
    .word 2719171031 # vsrl.vx v3, v1, t1
    .word 2517676247 # vsll.vi v1, v1, 1

    # Use the fact that the polynomial degree is no more than 128,
    # i.e. only the LSB of the upper half could be set.
    # Thanks to this we don't need to do the full reduction here.
    # Instead simply subtract the reduction polynomial.
    # This idea was taken from x86 ghash implementation in OpenSSL.
    .word 976269911 # vslideup.vi v4, v3, 1
    .word 1043378647 # vslidedown.vi v3, v3, 1

    # Propagate the carry out of the low half into bit 0 of the high half
    # (mask selects element 1 only).
    .word 1577136215 # vmv.v.i v0, 2
    .word 672268503 # vor.vv v1, v1, v4, v0.t

    # Need to set the mask to 3, if the carry bit is set.
    # (i.e. XOR the polynomial into both halves when bit 127 carried out.)
    .word 1577156695 # vmv.v.v v0, v3
    .word 1577071063 # vmv.v.i v3, 0
    .word 1546760663 # vmerge.vim v3, v3, 3, v0
    .word 1577156695 # vmv.v.v v0, v3

    .word 739311831 # vxor.vv v1, v1, v2, v0.t

    .word 33910951 # vse64.v v1, (a0)
    ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,@function
################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u64 Htable[2])
#
# One GHASH step: carry-less multiply Xi (at a0) by the prepared key (at a1)
# in GF(2^128), reduce modulo the GHASH polynomial, and store the product
# back to a0 in place.
#
# In:       a0 = Xi (128 bits, read and updated in place)
#           a1 = key table produced by gcm_init_rv64i_zvkb_zvbc
# Clobbers: t0-t4, v0-v6
# NOTE(review): t2 is loaded with 63 but does not appear in any decoded
# instruction below — looks like dead generated code; confirm against the
# perlasm generator before removing.
################################################################################
gcm_gmult_rv64i_zvkb_zvbc:
    ld      t0, (a1)                # t0 = low 64 bits of the key (a0 half)
    ld      t1, 8(a1)               # t1 = high 64 bits of the key (a1 half)
    li      t2, 63
    la      t3, Lpolymod
    ld      t3, 8(t3)               # t3 = high dword of the reduction constant

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add     a0, a0, 8
    li      t4, -8                  # negative stride for the strided load/store

    .word 0xc1817057 # vsetivli x0, 2, e64, m1, tu, mu

    .word 198537863 # vlse64.v v5, (a0), t4
    .word 1247060695 # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)h

    # v1 = (a0b1)l,(a0b0)l
    .word 844292311 # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    .word 911401431 # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    .word 844325463 # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    .word 911434071 # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    .word 976270039 # vslideup.vi v5, v3, 1
    .word 977318743 # vslideup.vi v6, v4, 1
    .word 1043378647 # vslidedown.vi v3, v3, 1
    .word 1044427351 # vslidedown.vi v4, v4, 1

    # Masked accumulation into element 0 only.
    .word 1577103447 # vmv.v.i v0, 1
    # v2 += (a0b1)h
    .word 740393303 # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    .word 740426071 # vxor.vv v2, v2, v4, v0.t

    # Masked accumulation into element 1 only.
    .word 1577136215 # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    .word 739410135 # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    .word 739442903 # vxor.vv v1, v1, v6, v0.t

    # Now the 256bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of the Gueron's Montgomery reduction.
    # The difference being the order of some operations has been changed,
    # to make a better use of vclmul(h) instructions.

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2                 (mask from the previous block still = 2)
    .word 940618199 # vslideup.vi v3, v1, 1, v0.t
    .word 809394647 # vclmul.vx v3, v3, t3, v0.t
    .word 739344599 # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    #v3 = (c1 * P)l, 0
    .word 807297495 # vclmul.vx v3, v1, t3, v0.t
    #v4 = (c1 * P)h, (c0 * P)h
    .word 907960919 # vclmulh.vx v4, v1, t3

    .word 1577103447 # vmv.v.i v0, 1
    .word 1043378647 # vslidedown.vi v3, v3, 1

    .word 772931799 # vxor.vv v1, v1, v4
    .word 739344599 # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper upper part of the product
    .word 773882199 # vxor.vv v2, v2, v1

    # Byte-reverse back to memory order and store with the reverse stride,
    # completing the endianness swap started at load time.
    .word 1243914583 # vrev8.v v2, v2
    .word 198537511 # vsse64.v v2, (a0), t4
    ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,@function
################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u64 Htable[2],
#                                const u8 *inp, size_t len)
#
# Bulk GHASH: for each 16-byte block of input, XOR it into the accumulator Xi
# and multiply by the prepared key in GF(2^128) with reduction. The final
# accumulator is stored back to a0.
#
# In:       a0 = Xi (128 bits, read and updated in place)
#           a1 = key table produced by gcm_init_rv64i_zvkb_zvbc
#           a2 = input data
#           a3 = length in bytes; assumed a positive multiple of 16 — the loop
#                subtracts 16 per iteration and exits on a3 == 0 (TODO confirm
#                callers never pass 0 or a non-multiple, which would run the
#                loop at least once / never terminate).
# Clobbers: t0-t4, a2, a3, v0-v6
# NOTE(review): t2 is loaded with 63 but does not appear in any decoded
# instruction below — looks like dead generated code; confirm against the
# perlasm generator before removing.
################################################################################
gcm_ghash_rv64i_zvkb_zvbc:
    ld      t0, (a1)                # t0 = low 64 bits of the key (a0 half)
    ld      t1, 8(a1)               # t1 = high 64 bits of the key (a1 half)
    li      t2, 63
    la      t3, Lpolymod
    ld      t3, 8(t3)               # t3 = high dword of the reduction constant

    # Load/store data in reverse order.
    # This is needed as a part of endianness swap.
    add     a0, a0, 8
    add     a2, a2, 8
    li      t4, -8                  # negative stride for the strided load/store

    .word 0xc1817057 # vsetivli x0, 2, e64, m1, tu, mu

    # Load the current accumulator Xi once before entering the loop.
    .word 198537863 # vlse64.v v5, (a0), t4

Lstep:
    # Read input data
    .word 198603655 # vle64.v v0, (a2)
    add     a2, a2, 16              # advance input pointer one block
    add     a3, a3, -16             # one fewer block remaining
    # XOR them into Xi
    .word 777224919 # vxor.vv v0, v0, v1

    .word 1247060695 # vrev8.v v5, v5

    # Multiplication

    # Do two 64x64 multiplications in one go to save some time
    # and simplify things.

    # A = a1a0 (t1, t0)
    # B = b1b0 (v5)
    # C = c1c0 (256 bit)
    # c1 = a1b1 + (a0b1)h + (a1b0)h
    # c0 = a0b0 + (a0b1)l + (a1b0)h

    # v1 = (a0b1)l,(a0b0)l
    .word 844292311 # vclmul.vx v1, v5, t0
    # v3 = (a0b1)h,(a0b0)h
    .word 911401431 # vclmulh.vx v3, v5, t0

    # v4 = (a1b1)l,(a1b0)l
    .word 844325463 # vclmul.vx v4, v5, t1
    # v2 = (a1b1)h,(a1b0)h
    .word 911434071 # vclmulh.vx v2, v5, t1

    # Is there a better way to do this?
    # Would need to swap the order of elements within a vector register.
    .word 976270039 # vslideup.vi v5, v3, 1
    .word 977318743 # vslideup.vi v6, v4, 1
    .word 1043378647 # vslidedown.vi v3, v3, 1
    .word 1044427351 # vslidedown.vi v4, v4, 1

    # Masked accumulation into element 0 only.
    .word 1577103447 # vmv.v.i v0, 1
    # v2 += (a0b1)h
    .word 740393303 # vxor.vv v2, v2, v3, v0.t
    # v2 += (a1b1)l
    .word 740426071 # vxor.vv v2, v2, v4, v0.t

    # Masked accumulation into element 1 only.
    .word 1577136215 # vmv.v.i v0, 2
    # v1 += (a0b0)h,0
    .word 739410135 # vxor.vv v1, v1, v5, v0.t
    # v1 += (a1b0)l,0
    .word 739442903 # vxor.vv v1, v1, v6, v0.t

    # Now the 256bit product should be stored in (v2,v1)
    # v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
    # v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l

    # Reduction
    # Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
    # This is a slight variation of the Gueron's Montgomery reduction.
    # The difference being the order of some operations has been changed,
    # to make a better use of vclmul(h) instructions.

    # First step:
    # c1 += (c0 * P)l
    # vmv.v.i v0, 2                 (mask from the previous block still = 2)
    .word 940618199 # vslideup.vi v3, v1, 1, v0.t
    .word 809394647 # vclmul.vx v3, v3, t3, v0.t
    .word 739344599 # vxor.vv v1, v1, v3, v0.t

    # Second step:
    # D = d1,d0 is final result
    # We want:
    # m1 = c1 + (c1 * P)h
    # m0 = (c1 * P)l + (c0 * P)h + c0
    # d1 = c3 + m1
    # d0 = c2 + m0

    #v3 = (c1 * P)l, 0
    .word 807297495 # vclmul.vx v3, v1, t3, v0.t
    #v4 = (c1 * P)h, (c0 * P)h
    .word 907960919 # vclmulh.vx v4, v1, t3

    .word 1577103447 # vmv.v.i v0, 1
    .word 1043378647 # vslidedown.vi v3, v3, 1

    .word 772931799 # vxor.vv v1, v1, v4
    .word 739344599 # vxor.vv v1, v1, v3, v0.t

    # XOR in the upper upper part of the product
    .word 773882199 # vxor.vv v2, v2, v1

    # Byte-reverse the running product for the next iteration / final store.
    .word 1243914967 # vrev8.v v2, v2

    bnez    a3, Lstep               # more input blocks remaining?

    # Store the final accumulator back with the reverse stride, completing
    # the endianness swap started at load time.
    .word 198537895 # vsse64.v v2, (a0), t4
    ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
.p2align 4
# GHASH reduction constant, as two little-endian 64-bit words:
# low dword 1, high dword 0xc2 << 56. This is the bit-reflected form of the
# GCM field polynomial used for the "subtract the polynomial" reduction steps
# above (same constant as in other OpenSSL GHASH implementations — confirm
# against the generator if the layout is changed).
Lpolymod:
    .dword 0x0000000000000001
    .dword 0xc200000000000000
.size Lpolymod,.-Lpolymod