#include <machine/asm.h>

# ---------------------------------------------------------------------
# ChaCha20_ctr32_v_zbb_zvkb
#
# ChaCha20 keystream XOR for RV64.  Vector and bit-manipulation
# instructions are emitted as pre-encoded .word values; the decoded
# mnemonic is given next to each one that sets up or finishes a block.
#
# Presumed C prototype (NOTE(review): confirm against the C caller):
#   void ChaCha20_ctr32_v_zbb_zvkb(unsigned char *out,
#                                  const unsigned char *inp, size_t len,
#                                  const unsigned int key[8],
#                                  const unsigned int counter[4]);
#
# Arguments:
#   a0 = output pointer (advanced while processing)
#   a1 = input pointer  (advanced while processing)
#   a2 = byte length    (counted down to zero); zero returns at once
#   a3 = key, eight 32-bit words read at offsets 0..28
#   a4 = four 32-bit words read at offsets 0..12; word 0 is the block
#        counter.  Nothing is written back through a4.
#
# Register roles inside .Lblock_loop:
#   t1           = VL, number of blocks handled by the vector unit
#   t2           = block counter of the first block of this iteration
#   a5 a6 a7 s0  = scalar state words 0..3
#   s1..s8       = scalar state words 4..11
#   s9 s10 s11 t0= scalar state words 12..15
#   t3..t6       = temporaries
#   v0..v15      = vector state words 0..15, one block per element
#   v16..v31     = input data / round temporaries
#
# Each .Lblock_loop iteration handles VL whole blocks with vector
# instructions plus one more block (possibly partial) with scalar
# instructions.  The keystream of the scalar block is written to a
# 64-byte scratch area at sp and xored into the data byte-wise.
#
# Stack: 96 bytes for s0..s11 plus 64 bytes of keystream scratch.
# Clobbers: t0..t6, a0..a2, a5..a7, v0..v31, vl, vtype.
# ---------------------------------------------------------------------
.text
.p2align 3
.globl ChaCha20_ctr32_v_zbb_zvkb
.type ChaCha20_ctr32_v_zbb_zvkb,@function
ChaCha20_ctr32_v_zbb_zvkb:
    # Empty request: return before touching the stack.  Without this
    # guard VL would be 0 below, `addi t4, t1, -1` would produce an
    # all-ones AVL, and the second vsetvli would select VLMAX blocks
    # that were never requested.
    beqz    a2, .Lzero_len_exit

    addi    sp, sp, -96
    sd      s0, 0(sp)
    sd      s1, 8(sp)
    sd      s2, 16(sp)
    sd      s3, 24(sp)
    sd      s4, 32(sp)
    sd      s5, 40(sp)
    sd      s6, 48(sp)
    sd      s7, 56(sp)
    sd      s8, 64(sp)
    sd      s9, 72(sp)
    sd      s10, 80(sp)
    sd      s11, 88(sp)
    addi    sp, sp, -64             # keystream scratch for the scalar block

    lw      t2, 0(a4)               # t2 = initial block counter

.Lblock_loop:
    # One chacha block per iteration is reserved for the scalar ALU.
    srli    t3, a2, 6               # t3 = number of whole 64-byte blocks
    .word   219050839               # vsetvli t1, t3, e32, m1, ta, ma
    slli    t4, t1, 6               # t4 = bytes covered by the vector part
    bltu    t4, a2, 1f
    # The vector part would consume every remaining byte, so hand one
    # block back to the scalar ALU.
    addi    t4, t1, -1
    .word   219083607               # vsetvli t1, t4, e32, m1, ta, ma
1:

    #### chacha block data
    # init chacha const states into v0~v3 (and a5, a6, a7, s0)
    # "expa" little endian
    li      a5, 0x61707865
    .word   1577566295              # vmv.v.x v0, a5
    # "nd 3" little endian
    li      a6, 0x3320646e
    .word   1577599191              # vmv.v.x v1, a6
    # "2-by" little endian
    li      a7, 0x79622d32
    .word   1577632087              # vmv.v.x v2, a7
    # "te k" little endian
    li      s0, 0x6b206574
    lw      s1, 0(a3)
    .word   1577337303              # vmv.v.x v3, s0

    # init chacha key states into v4~v11 (and s1..s8)
    lw      s2, 4(a3)
    .word   1577370199              # vmv.v.x v4, s1
    lw      s3, 8(a3)
    .word   1577665239              # vmv.v.x v5, s2
    lw      s4, 12(a3)
    .word   1577698135              # vmv.v.x v6, s3
    lw      s5, 16(a3)
    .word   1577731031              # vmv.v.x v7, s4
    lw      s6, 20(a3)
    .word   1577763927              # vmv.v.x v8, s5
    lw      s7, 24(a3)
    .word   1577796823              # vmv.v.x v9, s6
    lw      s8, 28(a3)
    .word   1577829719              # vmv.v.x v10, s7
    .word   1577862615              # vmv.v.x v11, s8

    # init chacha counter states into v12~v13:
    # v12 = element index + t2, v13 = word at 4(a4)
    lw      s10, 4(a4)
    .word   1376298583              # vid.v v12
    lw      s11, 8(a4)
    .word   46384727                # vadd.vx v12, v12, t2
    lw      t0, 12(a4)
    .word   1577928407              # vmv.v.x v13, s10
    add     s9, t2, t1              # scalar block follows the VL vector blocks

    # init the remaining two state words into v14~v15
    .word   1577961303              # vmv.v.x v14, s11
    .word   1577240535              # vmv.v.x v15, t0

    li      t3, 64
    # load the top-half of input data into v16~v23
    # (stride 64: element i receives words 0..7 of block i)
    .word   3955615751              # vlsseg8e32.v v16, (a1), t3

    # till now in block_loop, we used:
    # - v0~v15 for chacha states.
    # - v16~v23 for top-half of input data.
    # - v24~v31 have not been used yet.

    # Ten iterations; each runs one column round followed by one
    # diagonal round on the vector blocks and on the scalar block.
    # Rotate-left by n is encoded as rotate-right by 32 - n
    # (vror.vi on the vector state, roriw on the scalar state).
    li      t3, 10
.Lround_loop:
    # The bottom-half input is loaded only after this loop, so
    # v24~v31 are still unused here.
    addi    t3, t3, -1

    #### column round: vadd.vv / vxor.vv / vror.vi interleaved with
    #### the matching scalar add / xor / roriw
    # a += b; d ^= a; d <<<= 16;
    .word   33685591
    add     a5, a5, s1
    .word   34767063
    add     a6, a6, s2
    .word   35848535
    add     a7, a7, s3
    .word   36930007
    add     s0, s0, s4
    .word   784336471
    xor     s9, s9, a5
    .word   785417943
    xor     s10, s10, a6
    .word   786499415
    xor     s11, s11, a7
    .word   787580887
    xor     t0, t0, s0
    .word   1388852823
    .word   1628232859
    .word   1389901527
    .word   1628265755
    .word   1390950231
    .word   1628298651
    .word   1391998935
    .word   1627574939

    # c += d; b ^= c; b <<<= 12;
    .word   42337367
    add     s5, s5, s9
    .word   43418839
    add     s6, s6, s10
    .word   44500311
    add     s7, s7, s11
    .word   45581783
    add     s8, s8, t0
    .word   776208983
    xor     s1, s1, s5
    .word   777290455
    xor     s2, s2, s6
    .word   778371927
    xor     s3, s3, s7
    .word   779453399
    xor     s4, s4, s8
    .word   1380594263
    .word   1631900827
    .word   1381642967
    .word   1632196891
    .word   1382691671
    .word   1632229787
    .word   1383740375
    .word   1632262683

    # a += b; d ^= a; d <<<= 8;
    .word   33685591
    add     a5, a5, s1
    .word   34767063
    add     a6, a6, s2
    .word   35848535
    add     a7, a7, s3
    .word   36930007
    add     s0, s0, s4
    .word   784336471
    xor     s9, s9, a5
    .word   785417943
    xor     s10, s10, a6
    .word   786499415
    xor     s11, s11, a7
    .word   787580887
    xor     t0, t0, s0
    .word   1389114967
    .word   1636621467
    .word   1390163671
    .word   1636654363
    .word   1391212375
    .word   1636687259
    .word   1392261079
    .word   1635963547

    # c += d; b ^= c; b <<<= 7;
    .word   42337367
    add     s5, s5, s9
    .word   43418839
    add     s6, s6, s10
    .word   44500311
    add     s7, s7, s11
    .word   45581783
    add     s8, s8, t0
    .word   776208983
    xor     s1, s1, s5
    .word   777290455
    xor     s2, s2, s6
    .word   778371927
    xor     s3, s3, s7
    .word   779453399
    xor     s4, s4, s8
    .word   1380758103
    .word   1637143707
    .word   1381806807
    .word   1637439771
    .word   1382855511
    .word   1637472667
    .word   1383904215
    .word   1637505563


    #### diagonal round: same operations on rotated register groups
    # a += b; d ^= a; d <<<= 16;
    .word   36831703
    add     s0, s0, s1
    .word   33718359
    add     a5, a5, s2
    .word   34799831
    add     a6, a6, s3
    .word   35881303
    add     a7, a7, s4
    .word   786532183
    xor     s11, s11, s0
    .word   787482583
    xor     t0, t0, a5
    .word   784369239
    xor     s9, s9, a6
    .word   785450711
    xor     s10, s10, a7
    .word   1390950231
    .word   1628298651
    .word   1391998935
    .word   1627574939
    .word   1388852823
    .word   1628232859
    .word   1389901527
    .word   1628265755

    # c += d; b ^= c; b <<<= 12;
    .word   43451607
    add     s6, s6, s11
    .word   44533079
    add     s7, s7, t0
    .word   45483479
    add     s8, s8, s9
    .word   42370135
    add     s5, s5, s10
    .word   776241751
    xor     s1, s1, s6
    .word   777323223
    xor     s2, s2, s7
    .word   778404695
    xor     s3, s3, s8
    .word   779355095
    xor     s4, s4, s5
    .word   1380594263
    .word   1631900827
    .word   1381642967
    .word   1632196891
    .word   1382691671
    .word   1632229787
    .word   1383740375
    .word   1632262683

    # a += b; d ^= a; d <<<= 8;
    .word   36831703
    add     s0, s0, s1
    .word   33718359
    add     a5, a5, s2
    .word   34799831
    add     a6, a6, s3
    .word   35881303
    add     a7, a7, s4
    .word   786532183
    xor     s11, s11, s0
    .word   787482583
    xor     t0, t0, a5
    .word   784369239
    xor     s9, s9, a6
    .word   785450711
    xor     s10, s10, a7
    .word   1391212375
    .word   1636687259
    .word   1392261079
    .word   1635963547
    .word   1389114967
    .word   1636621467
    .word   1390163671
    .word   1636654363

    # c += d; b ^= c; b <<<= 7;
    .word   43451607
    add     s6, s6, s11
    .word   44533079
    add     s7, s7, t0
    .word   45483479
    add     s8, s8, s9
    .word   42370135
    add     s5, s5, s10
    .word   776241751
    xor     s1, s1, s6
    .word   777323223
    xor     s2, s2, s7
    .word   778404695
    xor     s3, s3, s8
    .word   779355095
    xor     s4, s4, s5
    .word   1380758103
    .word   1637143707
    .word   1381806807
    .word   1637439771
    .word   1382855511
    .word   1637472667
    .word   1383904215
    .word   1637505563


    bnez    t3, .Lround_loop

    li      t3, 64
    # load the bottom-half of input data into v24~v31
    addi    t4, a1, 32
    .word   3956206599              # vlsseg8e32.v v24, (t4), t3

    # From here every vector register holds live data until the
    # top-half output has been stored.

    # add chacha top-half initial block states
    # "expa" little endian
    li      t3, 0x61707865
    .word   34488407                # vadd.vx v0, v0, t3
    add     a5, a5, t3
    # "nd 3" little endian
    li      t4, 0x3320646e
    .word   35569879                # vadd.vx v1, v1, t4
    add     a6, a6, t4
    lw      t3, 0(a3)
    # "2-by" little endian
    li      t5, 0x79622d32
    .word   36651351                # vadd.vx v2, v2, t5
    add     a7, a7, t5
    lw      t4, 4(a3)
    # "te k" little endian
    li      t6, 0x6b206574
    .word   37732823                # vadd.vx v3, v3, t6
    add     s0, s0, t6
    lw      t5, 8(a3)
    .word   38683223                # vadd.vx v4, v4, t3
    add     s1, s1, t3
    lw      t6, 12(a3)
    .word   39764695                # vadd.vx v5, v5, t4
    add     s2, s2, t4
    .word   40846167                # vadd.vx v6, v6, t5
    add     s3, s3, t5
    .word   41927639                # vadd.vx v7, v7, t6
    add     s4, s4, t6

    # xor with the top-half input; the scalar keystream words 0..7
    # go to the stack scratch area in between
    .word   788531287               # vxor.vv v16, v16, v0
    sw      a5, 0(sp)
    sw      a6, 4(sp)
    .word   789612759               # vxor.vv v17, v17, v1
    sw      a7, 8(sp)
    sw      s0, 12(sp)
    .word   790694231               # vxor.vv v18, v18, v2
    sw      s1, 16(sp)
    sw      s2, 20(sp)
    .word   791775703               # vxor.vv v19, v19, v3
    sw      s3, 24(sp)
    sw      s4, 28(sp)
    .word   792857175               # vxor.vv v20, v20, v4
    lw      t3, 16(a3)
    .word   793938647               # vxor.vv v21, v21, v5
    lw      t4, 20(a3)
    .word   795020119               # vxor.vv v22, v22, v6
    lw      t5, 24(a3)
    .word   796101591               # vxor.vv v23, v23, v7

    # save the top-half of output from v16~v23
    li      t6, 64
    .word   3958728743              # vssseg8e32.v v16, (a0), t6

    # add chacha bottom-half initial block states
    .word   42878039                # vadd.vx v8, v8, t3
    add     s5, s5, t3
    lw      t6, 28(a3)
    .word   43959511                # vadd.vx v9, v9, t4
    add     s6, s6, t4
    lw      t3, 4(a4)
    .word   45040983                # vadd.vx v10, v10, t5
    add     s7, s7, t5
    lw      t4, 8(a4)
    .word   46122455                # vadd.vx v11, v11, t6
    add     s8, s8, t6
    lw      t5, 12(a4)
    # v0 was already consumed by the top-half xor, so it is reused
    # for the per-element block index
    .word   1376297047              # vid.v v0
    add     s9, s9, t2
    .word   46384727                # vadd.vx v12, v12, t2
    add     s9, s9, t1
    .word   48121559                # vadd.vx v13, v13, t3
    add     s10, s10, t3
    .word   49203031                # vadd.vx v14, v14, t4
    add     s11, s11, t4
    .word   50284503                # vadd.vx v15, v15, t5
    add     t0, t0, t5
    .word   46138967                # vadd.vv v12, v12, v0
    # xor with the bottom-half input; the scalar keystream words
    # 8..15 go to the stack scratch area in between
    .word   797183063               # vxor.vv v24, v24, v8
    sw      s5, 32(sp)
    .word   798264535               # vxor.vv v25, v25, v9
    sw      s6, 36(sp)
    .word   799346007               # vxor.vv v26, v26, v10
    sw      s7, 40(sp)
    .word   800427479               # vxor.vv v27, v27, v11
    sw      s8, 44(sp)
    .word   802590423               # vxor.vv v29, v29, v13
    sw      s9, 48(sp)
    .word   801508951               # vxor.vv v28, v28, v12
    sw      s10, 52(sp)
    .word   803671895               # vxor.vv v30, v30, v14
    sw      s11, 56(sp)
    .word   804753367               # vxor.vv v31, v31, v15
    sw      t0, 60(sp)

    # save the bottom-half of output from v24~v31
    li      t3, 64
    addi    t4, a0, 32
    .word   3956206631              # vssseg8e32.v v24, (t4), t3

    # the computed vector parts: `64 * VL`
    slli    t3, t1, 6

    add     a1, a1, t3
    add     a0, a0, t3
    sub     a2, a2, t3
    add     t2, t2, t1

    # process the scalar data block: t4 = min(remaining, 64) bytes are
    # xored with the keystream stored at sp
    addi    t2, t2, 1
    li      t3, 64
    .word   197549747               # minu t4, a2, t3
    sub     a2, a2, t4
    mv      t5, sp
.Lscalar_data_loop:
    .word   205452119               # vsetvli t1, t4, e8, m8, ta, ma
    # from this on, vector registers are grouped with lmul = 8
    .word   33915911                # vle8.v v8, (a1)
    .word   34539527                # vle8.v v16, (t5)
    .word   780665943               # vxor.vv v8, v8, v16
    .word   33883175                # vse8.v v8, (a0)
    add     a1, a1, t1
    add     a0, a0, t1
    add     t5, t5, t1
    sub     t4, t4, t1
    bnez    t4, .Lscalar_data_loop

    bnez    a2, .Lblock_loop

    addi    sp, sp, 64
    ld      s0, 0(sp)
    ld      s1, 8(sp)
    ld      s2, 16(sp)
    ld      s3, 24(sp)
    ld      s4, 32(sp)
    ld      s5, 40(sp)
    ld      s6, 48(sp)
    ld      s7, 56(sp)
    ld      s8, 64(sp)
    ld      s9, 72(sp)
    ld      s10, 80(sp)
    ld      s11, 88(sp)
    addi    sp, sp, 96

.Lzero_len_exit:
    ret
.size ChaCha20_ctr32_v_zbb_zvkb,.-ChaCha20_ctr32_v_zbb_zvkb