#include <machine/asm.h>

# ChaCha20_ctr32_v_zbb
#
# ChaCha20 keystream XOR for RV64 using the V (vector) and Zbb extensions.
# V and Zbb instructions are emitted as pre-encoded .word values; the
# trailing comment on each .word is a hand decode of that encoding.
#
# Arguments, as used by the code below:
#   a0  output pointer
#   a1  input pointer
#   a2  length in bytes (0 returns immediately)
#   a3  pointer to 8 x 32-bit key words
#   a4  pointer to 4 x 32-bit counter/nonce words; word 0 is the block
#       counter and is only read here, never written back
# NOTE(review): this presumably matches the usual C prototype
#   (out, inp, len, key[8], counter[4]) - confirm against the caller.
#
# Register roles inside the function:
#   t1        VL, the number of 64-byte blocks the vector unit handles
#             in the current pass
#   t2        running 32-bit block counter
#   v0-v15    ChaCha state word 0..15, one block per vector element
#   a5, a6, a7, s0, s1-s8, s9, s10, s11, t0
#             state words 0..15 of one extra block computed on the
#             scalar ALU in parallel with the vector blocks
#   v16-v23   words 0..7 of every vector block of input
#   v24-v31   rotate temporaries (v24-v27) during the rounds, afterwards
#             words 8..15 of every vector block of input
#   t3-t6     scratch
#
# Stack frame: 96 bytes holding s0-s11, plus 64 bytes at sp that receive
# the keystream of the scalar block.

.text
.p2align 3
.globl ChaCha20_ctr32_v_zbb
.type ChaCha20_ctr32_v_zbb,@function
ChaCha20_ctr32_v_zbb:
    # Zero length: return before touching memory.  Without this guard the
    # VL - 1 fallback in .Lblock_loop wraps to an all-ones AVL, vsetvli
    # then selects VLMAX and the routine runs past both buffers.
    beqz a2, .Lno_data

    addi sp, sp, -96
    sd s0, 0(sp)
    sd s1, 8(sp)
    sd s2, 16(sp)
    sd s3, 24(sp)
    sd s4, 32(sp)
    sd s5, 40(sp)
    sd s6, 48(sp)
    sd s7, 56(sp)
    sd s8, 64(sp)
    sd s9, 72(sp)
    sd s10, 80(sp)
    sd s11, 88(sp)
    # 64 more bytes: keystream buffer of the scalar block
    addi sp, sp, -64

    # t2 = initial block counter
    lw t2, 0(a4)

.Lblock_loop:
    # We will use the scalar ALU for 1 chacha block.
    srli t3, a2, 6
    .word 219050839     # vsetvli t1, t3, e32, m1, ta, ma
    slli t4, t1, 6
    bltu t4, a2, 1f
    # The vector blocks would consume all remaining data, so hand one of
    # them over to the scalar ALU (VL = VL - 1).
    addi t4, t1, -1
    .word 219083607     # vsetvli t1, t4, e32, m1, ta, ma
1:

    #### chacha block data
    # init chacha const states into v0~v3
    # "expa" little endian
    li a5, 0x61707865
    .word 1577566295    # vmv.v.x v0, a5
    # "nd 3" little endian
    li a6, 0x3320646e
    .word 1577599191    # vmv.v.x v1, a6
    # "2-by" little endian
    li a7, 0x79622d32
    .word 1577632087    # vmv.v.x v2, a7
    # "te k" little endian
    li s0, 0x6b206574
    lw s1, 0(a3)
    .word 1577337303    # vmv.v.x v3, s0

    # init chacha key states into v4~v11
    lw s2, 4(a3)
    .word 1577370199    # vmv.v.x v4, s1
    lw s3, 8(a3)
    .word 1577665239    # vmv.v.x v5, s2
    lw s4, 12(a3)
    .word 1577698135    # vmv.v.x v6, s3
    lw s5, 16(a3)
    .word 1577731031    # vmv.v.x v7, s4
    lw s6, 20(a3)
    .word 1577763927    # vmv.v.x v8, s5
    lw s7, 24(a3)
    .word 1577796823    # vmv.v.x v9, s6
    lw s8, 28(a3)
    .word 1577829719    # vmv.v.x v10, s7
    .word 1577862615    # vmv.v.x v11, s8

    # init chacha counter states into v12~v13
    # v12 = counter + element index, so every element is its own block
    lw s10, 4(a4)
    .word 1376298583    # vid.v v12
    lw s11, 8(a4)
    .word 46384727      # vadd.vx v12, v12, t2
    lw t0, 12(a4)
    .word 1577928407    # vmv.v.x v13, s10
    # the scalar block is the one right after the VL vector blocks
    add s9, t2, t1

    # init chacha nonce states into v14~v15
    .word 1577961303    # vmv.v.x v14, s11
    .word 1577240535    # vmv.v.x v15, t0

    li t3, 64
    # load the top-half of input data into v16~v23
    # (stride 64: field k of block i lands in element i of v16+k)
    .word 3955615751    # vlsseg8e32.v v16, (a1), t3

    # till now in block_loop, we used:
    # - v0~v15 for chacha states.
    # - v16~v23 for top-half of input data.
    # - v24~v31 are still unused.

    # 10 iterations of column round + diagonal round = 20 rounds
    li t3, 10
.Lround_loop:
    # we can use v24~v31 as temporary registers in round_loop.
    # Rotates: the vector side uses shift-left, shift-right, or;
    # the scalar side uses roriw by (32 - n).
    addi t3, t3, -1

    #### column round
    # a += b; d ^= a; d <<<= 16;
    .word 33685591      # vadd.vv v0, v0, v4
    add a5, a5, s1
    .word 34767063      # vadd.vv v1, v1, v5
    add a6, a6, s2
    .word 35848535      # vadd.vv v2, v2, v6
    add a7, a7, s3
    .word 36930007      # vadd.vv v3, v3, v7
    add s0, s0, s4
    .word 784336471     # vxor.vv v12, v12, v0
    xor s9, s9, a5
    .word 785417943     # vxor.vv v13, v13, v1
    xor s10, s10, a6
    .word 786499415     # vxor.vv v14, v14, v2
    xor s11, s11, a7
    .word 787580887     # vxor.vv v15, v15, v3
    xor t0, t0, s0
    .word 2529705047    # vsll.vi v24, v12, 16
    .word 2530753751    # vsll.vi v25, v13, 16
    .word 2531802455    # vsll.vi v26, v14, 16
    .word 2532851159    # vsll.vi v27, v15, 16
    .word 2731030103    # vsrl.vi v12, v12, 16
    .word 2732078807    # vsrl.vi v13, v13, 16
    .word 2733127511    # vsrl.vi v14, v14, 16
    .word 2734176215    # vsrl.vi v15, v15, 16
    .word 718014039     # vor.vv v12, v12, v24
    .word 1628232859    # roriw s9, s9, 16
    .word 719095511     # vor.vv v13, v13, v25
    .word 1628265755    # roriw s10, s10, 16
    .word 720176983     # vor.vv v14, v14, v26
    .word 1628298651    # roriw s11, s11, 16
    .word 721258455     # vor.vv v15, v15, v27
    .word 1627574939    # roriw t0, t0, 16

    # c += d; b ^= c; b <<<= 12;
    .word 42337367      # vadd.vv v8, v8, v12
    add s5, s5, s9
    .word 43418839      # vadd.vv v9, v9, v13
    add s6, s6, s10
    .word 44500311      # vadd.vv v10, v10, v14
    add s7, s7, s11
    .word 45581783      # vadd.vv v11, v11, v15
    add s8, s8, t0
    .word 776208983     # vxor.vv v4, v4, v8
    xor s1, s1, s5
    .word 777290455     # vxor.vv v5, v5, v9
    xor s2, s2, s6
    .word 778371927     # vxor.vv v6, v6, v10
    xor s3, s3, s7
    .word 779453399     # vxor.vv v7, v7, v11
    xor s4, s4, s8
    .word 2521185367    # vsll.vi v24, v4, 12
    .word 2522234071    # vsll.vi v25, v5, 12
    .word 2523282775    # vsll.vi v26, v6, 12
    .word 2524331479    # vsll.vi v27, v7, 12
    .word 2722771543    # vsrl.vi v4, v4, 20
    .word 2723820247    # vsrl.vi v5, v5, 20
    .word 2724868951    # vsrl.vi v6, v6, 20
    .word 2725917655    # vsrl.vi v7, v7, 20
    .word 709624407     # vor.vv v4, v4, v24
    .word 1631900827    # roriw s1, s1, 20
    .word 710705879     # vor.vv v5, v5, v25
    .word 1632196891    # roriw s2, s2, 20
    .word 711787351     # vor.vv v6, v6, v26
    .word 1632229787    # roriw s3, s3, 20
    .word 712868823     # vor.vv v7, v7, v27
    .word 1632262683    # roriw s4, s4, 20

    # a += b; d ^= a; d <<<= 8;
    .word 33685591      # vadd.vv v0, v0, v4
    add a5, a5, s1
    .word 34767063      # vadd.vv v1, v1, v5
    add a6, a6, s2
    .word 35848535      # vadd.vv v2, v2, v6
    add a7, a7, s3
    .word 36930007      # vadd.vv v3, v3, v7
    add s0, s0, s4
    .word 784336471     # vxor.vv v12, v12, v0
    xor s9, s9, a5
    .word 785417943     # vxor.vv v13, v13, v1
    xor s10, s10, a6
    .word 786499415     # vxor.vv v14, v14, v2
    xor s11, s11, a7
    .word 787580887     # vxor.vv v15, v15, v3
    xor t0, t0, s0
    .word 2529442903    # vsll.vi v24, v12, 8
    .word 2530491607    # vsll.vi v25, v13, 8
    .word 2531540311    # vsll.vi v26, v14, 8
    .word 2532589015    # vsll.vi v27, v15, 8
    .word 2731292247    # vsrl.vi v12, v12, 24
    .word 2732340951    # vsrl.vi v13, v13, 24
    .word 2733389655    # vsrl.vi v14, v14, 24
    .word 2734438359    # vsrl.vi v15, v15, 24
    .word 718014039     # vor.vv v12, v12, v24
    .word 1636621467    # roriw s9, s9, 24
    .word 719095511     # vor.vv v13, v13, v25
    .word 1636654363    # roriw s10, s10, 24
    .word 720176983     # vor.vv v14, v14, v26
    .word 1636687259    # roriw s11, s11, 24
    .word 721258455     # vor.vv v15, v15, v27
    .word 1635963547    # roriw t0, t0, 24

    # c += d; b ^= c; b <<<= 7;
    .word 42337367      # vadd.vv v8, v8, v12
    add s5, s5, s9
    .word 43418839      # vadd.vv v9, v9, v13
    add s6, s6, s10
    .word 44500311      # vadd.vv v10, v10, v14
    add s7, s7, s11
    .word 45581783      # vadd.vv v11, v11, v15
    add s8, s8, t0
    .word 776208983     # vxor.vv v4, v4, v8
    xor s1, s1, s5
    .word 777290455     # vxor.vv v5, v5, v9
    xor s2, s2, s6
    .word 778371927     # vxor.vv v6, v6, v10
    xor s3, s3, s7
    .word 779453399     # vxor.vv v7, v7, v11
    xor s4, s4, s8
    .word 2521021527    # vsll.vi v24, v4, 7
    .word 2522070231    # vsll.vi v25, v5, 7
    .word 2523118935    # vsll.vi v26, v6, 7
    .word 2524167639    # vsll.vi v27, v7, 7
    .word 2722935383    # vsrl.vi v4, v4, 25
    .word 2723984087    # vsrl.vi v5, v5, 25
    .word 2725032791    # vsrl.vi v6, v6, 25
    .word 2726081495    # vsrl.vi v7, v7, 25
    .word 709624407     # vor.vv v4, v4, v24
    .word 1637143707    # roriw s1, s1, 25
    .word 710705879     # vor.vv v5, v5, v25
    .word 1637439771    # roriw s2, s2, 25
    .word 711787351     # vor.vv v6, v6, v26
    .word 1637472667    # roriw s3, s3, 25
    .word 712868823     # vor.vv v7, v7, v27
    .word 1637505563    # roriw s4, s4, 25


    #### diagonal round
    # Same b and c registers as the column round, but in this code the
    # a and d operands are picked one lane over: the a/d pairs are
    # (v3,v14) (v0,v15) (v1,v12) (v2,v13), matching the scalar operands.
    # a += b; d ^= a; d <<<= 16;
    .word 36831703      # vadd.vv v3, v3, v4
    add s0, s0, s1
    .word 33718359      # vadd.vv v0, v0, v5
    add a5, a5, s2
    .word 34799831      # vadd.vv v1, v1, v6
    add a6, a6, s3
    .word 35881303      # vadd.vv v2, v2, v7
    add a7, a7, s4
    .word 786532183     # vxor.vv v14, v14, v3
    xor s11, s11, s0
    .word 787482583     # vxor.vv v15, v15, v0
    xor t0, t0, a5
    .word 784369239     # vxor.vv v12, v12, v1
    xor s9, s9, a6
    .word 785450711     # vxor.vv v13, v13, v2
    xor s10, s10, a7
    .word 2531802199    # vsll.vi v24, v14, 16
    .word 2532850903    # vsll.vi v25, v15, 16
    .word 2529705303    # vsll.vi v26, v12, 16
    .word 2530754007    # vsll.vi v27, v13, 16
    .word 2733127511    # vsrl.vi v14, v14, 16
    .word 2734176215    # vsrl.vi v15, v15, 16
    .word 2731030103    # vsrl.vi v12, v12, 16
    .word 2732078807    # vsrl.vi v13, v13, 16
    .word 720111447     # vor.vv v14, v14, v24
    .word 1628298651    # roriw s11, s11, 16
    .word 721192919     # vor.vv v15, v15, v25
    .word 1627574939    # roriw t0, t0, 16
    .word 718079575     # vor.vv v12, v12, v26
    .word 1628232859    # roriw s9, s9, 16
    .word 719161047     # vor.vv v13, v13, v27
    .word 1628265755    # roriw s10, s10, 16

    # c += d; b ^= c; b <<<= 12;
    .word 43451607      # vadd.vv v9, v9, v14
    add s6, s6, s11
    .word 44533079      # vadd.vv v10, v10, v15
    add s7, s7, t0
    .word 45483479      # vadd.vv v11, v11, v12
    add s8, s8, s9
    .word 42370135      # vadd.vv v8, v8, v13
    add s5, s5, s10
    .word 776241751     # vxor.vv v4, v4, v9
    xor s1, s1, s6
    .word 777323223     # vxor.vv v5, v5, v10
    xor s2, s2, s7
    .word 778404695     # vxor.vv v6, v6, v11
    xor s3, s3, s8
    .word 779355095     # vxor.vv v7, v7, v8
    xor s4, s4, s5
    .word 2521185367    # vsll.vi v24, v4, 12
    .word 2522234071    # vsll.vi v25, v5, 12
    .word 2523282775    # vsll.vi v26, v6, 12
    .word 2524331479    # vsll.vi v27, v7, 12
    .word 2722771543    # vsrl.vi v4, v4, 20
    .word 2723820247    # vsrl.vi v5, v5, 20
    .word 2724868951    # vsrl.vi v6, v6, 20
    .word 2725917655    # vsrl.vi v7, v7, 20
    .word 709624407     # vor.vv v4, v4, v24
    .word 1631900827    # roriw s1, s1, 20
    .word 710705879     # vor.vv v5, v5, v25
    .word 1632196891    # roriw s2, s2, 20
    .word 711787351     # vor.vv v6, v6, v26
    .word 1632229787    # roriw s3, s3, 20
    .word 712868823     # vor.vv v7, v7, v27
    .word 1632262683    # roriw s4, s4, 20

    # a += b; d ^= a; d <<<= 8;
    .word 36831703      # vadd.vv v3, v3, v4
    add s0, s0, s1
    .word 33718359      # vadd.vv v0, v0, v5
    add a5, a5, s2
    .word 34799831      # vadd.vv v1, v1, v6
    add a6, a6, s3
    .word 35881303      # vadd.vv v2, v2, v7
    add a7, a7, s4
    .word 786532183     # vxor.vv v14, v14, v3
    xor s11, s11, s0
    .word 787482583     # vxor.vv v15, v15, v0
    xor t0, t0, a5
    .word 784369239     # vxor.vv v12, v12, v1
    xor s9, s9, a6
    .word 785450711     # vxor.vv v13, v13, v2
    xor s10, s10, a7
    .word 2531540055    # vsll.vi v24, v14, 8
    .word 2532588759    # vsll.vi v25, v15, 8
    .word 2529443159    # vsll.vi v26, v12, 8
    .word 2530491863    # vsll.vi v27, v13, 8
    .word 2733389655    # vsrl.vi v14, v14, 24
    .word 2734438359    # vsrl.vi v15, v15, 24
    .word 2731292247    # vsrl.vi v12, v12, 24
    .word 2732340951    # vsrl.vi v13, v13, 24
    .word 720111447     # vor.vv v14, v14, v24
    .word 1636687259    # roriw s11, s11, 24
    .word 721192919     # vor.vv v15, v15, v25
    .word 1635963547    # roriw t0, t0, 24
    .word 718079575     # vor.vv v12, v12, v26
    .word 1636621467    # roriw s9, s9, 24
    .word 719161047     # vor.vv v13, v13, v27
    .word 1636654363    # roriw s10, s10, 24

    # c += d; b ^= c; b <<<= 7;
    .word 43451607      # vadd.vv v9, v9, v14
    add s6, s6, s11
    .word 44533079      # vadd.vv v10, v10, v15
    add s7, s7, t0
    .word 45483479      # vadd.vv v11, v11, v12
    add s8, s8, s9
    .word 42370135      # vadd.vv v8, v8, v13
    add s5, s5, s10
    .word 776241751     # vxor.vv v4, v4, v9
    xor s1, s1, s6
    .word 777323223     # vxor.vv v5, v5, v10
    xor s2, s2, s7
    .word 778404695     # vxor.vv v6, v6, v11
    xor s3, s3, s8
    .word 779355095     # vxor.vv v7, v7, v8
    xor s4, s4, s5
    .word 2521021527    # vsll.vi v24, v4, 7
    .word 2522070231    # vsll.vi v25, v5, 7
    .word 2523118935    # vsll.vi v26, v6, 7
    .word 2524167639    # vsll.vi v27, v7, 7
    .word 2722935383    # vsrl.vi v4, v4, 25
    .word 2723984087    # vsrl.vi v5, v5, 25
    .word 2725032791    # vsrl.vi v6, v6, 25
    .word 2726081495    # vsrl.vi v7, v7, 25
    .word 709624407     # vor.vv v4, v4, v24
    .word 1637143707    # roriw s1, s1, 25
    .word 710705879     # vor.vv v5, v5, v25
    .word 1637439771    # roriw s2, s2, 25
    .word 711787351     # vor.vv v6, v6, v26
    .word 1637472667    # roriw s3, s3, 25
    .word 712868823     # vor.vv v7, v7, v27
    .word 1637505563    # roriw s4, s4, 25


    bnez t3, .Lround_loop

    li t3, 64
    # load the bottom-half of input data into v24~v31
    addi t4, a1, 32
    .word 3956206599    # vlsseg8e32.v v24, (t4), t3

    # From here on all 32 vector registers are live until the top-half
    # output has been stored.

    # add chacha top-half initial block states
    # (the key words are reloaded from memory because s1~s4 now hold
    # round output)
    # "expa" little endian
    li t3, 0x61707865
    .word 34488407      # vadd.vx v0, v0, t3
    add a5, a5, t3
    # "nd 3" little endian
    li t4, 0x3320646e
    .word 35569879      # vadd.vx v1, v1, t4
    add a6, a6, t4
    lw t3, 0(a3)
    # "2-by" little endian
    li t5, 0x79622d32
    .word 36651351      # vadd.vx v2, v2, t5
    add a7, a7, t5
    lw t4, 4(a3)
    # "te k" little endian
    li t6, 0x6b206574
    .word 37732823      # vadd.vx v3, v3, t6
    add s0, s0, t6
    lw t5, 8(a3)
    .word 38683223      # vadd.vx v4, v4, t3
    add s1, s1, t3
    lw t6, 12(a3)
    .word 39764695      # vadd.vx v5, v5, t4
    add s2, s2, t4
    .word 40846167      # vadd.vx v6, v6, t5
    add s3, s3, t5
    .word 41927639      # vadd.vx v7, v7, t6
    add s4, s4, t6

    # xor with the top-half input; the scalar keystream words 0..7 go to
    # the stack buffer at the same time
    .word 788531287     # vxor.vv v16, v16, v0
    sw a5, 0(sp)
    sw a6, 4(sp)
    .word 789612759     # vxor.vv v17, v17, v1
    sw a7, 8(sp)
    sw s0, 12(sp)
    .word 790694231     # vxor.vv v18, v18, v2
    sw s1, 16(sp)
    sw s2, 20(sp)
    .word 791775703     # vxor.vv v19, v19, v3
    sw s3, 24(sp)
    sw s4, 28(sp)
    .word 792857175     # vxor.vv v20, v20, v4
    lw t3, 16(a3)
    .word 793938647     # vxor.vv v21, v21, v5
    lw t4, 20(a3)
    .word 795020119     # vxor.vv v22, v22, v6
    lw t5, 24(a3)
    .word 796101591     # vxor.vv v23, v23, v7

    # save the top-half of output from v16~v23
    li t6, 64
    .word 3958728743    # vssseg8e32.v v16, (a0), t6

    # add chacha bottom-half initial block states
    .word 42878039      # vadd.vx v8, v8, t3
    add s5, s5, t3
    lw t6, 28(a3)
    .word 43959511      # vadd.vx v9, v9, t4
    add s6, s6, t4
    lw t3, 4(a4)
    .word 45040983      # vadd.vx v10, v10, t5
    add s7, s7, t5
    lw t4, 8(a4)
    .word 46122455      # vadd.vx v11, v11, t6
    add s8, s8, t6
    lw t5, 12(a4)
    # counter word: vector adds counter + element index (v0 is free
    # again and serves as the index vector), scalar adds counter + VL
    .word 1376297047    # vid.v v0
    add s9, s9, t2
    .word 46384727      # vadd.vx v12, v12, t2
    add s9, s9, t1
    .word 48121559      # vadd.vx v13, v13, t3
    add s10, s10, t3
    .word 49203031      # vadd.vx v14, v14, t4
    add s11, s11, t4
    .word 50284503      # vadd.vx v15, v15, t5
    add t0, t0, t5
    .word 46138967      # vadd.vv v12, v12, v0
    # xor with the bottom-half input; scalar keystream words 8..15 go to
    # the stack buffer
    .word 797183063     # vxor.vv v24, v24, v8
    sw s5, 32(sp)
    .word 798264535     # vxor.vv v25, v25, v9
    sw s6, 36(sp)
    .word 799346007     # vxor.vv v26, v26, v10
    sw s7, 40(sp)
    .word 800427479     # vxor.vv v27, v27, v11
    sw s8, 44(sp)
    .word 802590423     # vxor.vv v29, v29, v13
    sw s9, 48(sp)
    .word 801508951     # vxor.vv v28, v28, v12
    sw s10, 52(sp)
    .word 803671895     # vxor.vv v30, v30, v14
    sw s11, 56(sp)
    .word 804753367     # vxor.vv v31, v31, v15
    sw t0, 60(sp)

    # save the bottom-half of output from v24~v31
    li t3, 64
    addi t4, a0, 32
    .word 3956206631    # vssseg8e32.v v24, (t4), t3

    # the computed vector parts: `64 * VL`
    slli t3, t1, 6

    add a1, a1, t3
    add a0, a0, t3
    sub a2, a2, t3
    add t2, t2, t1

    # process the scalar data block
    # t4 = min(remaining length, 64) bytes are XORed with the keystream
    # saved at sp; the counter moves past the scalar block as well
    addi t2, t2, 1
    li t3, 64
    .word 197549747     # minu t4, a2, t3
    sub a2, a2, t4
    mv t5, sp
.Lscalar_data_loop:
    .word 205452119     # vsetvli t1, t4, e8, m8, ta, ma
    # from this on, vector registers are grouped with lmul = 8
    .word 33915911      # vle8.v v8, (a1)
    .word 34539527      # vle8.v v16, (t5)
    .word 780665943     # vxor.vv v8, v8, v16
    .word 33883175      # vse8.v v8, (a0)
    add a1, a1, t1
    add a0, a0, t1
    add t5, t5, t1
    sub t4, t4, t1
    bnez t4, .Lscalar_data_loop

    bnez a2, .Lblock_loop

    # drop the keystream buffer, then restore the callee-saved registers
    addi sp, sp, 64
    ld s0, 0(sp)
    ld s1, 8(sp)
    ld s2, 16(sp)
    ld s3, 24(sp)
    ld s4, 32(sp)
    ld s5, 40(sp)
    ld s6, 48(sp)
    ld s7, 56(sp)
    ld s8, 64(sp)
    ld s9, 72(sp)
    ld s10, 80(sp)
    ld s11, 88(sp)
    addi sp, sp, 96

.Lno_data:
    ret
.size ChaCha20_ctr32_v_zbb,.-ChaCha20_ctr32_v_zbb