// AArch64 field and scalar arithmetic helpers for the SM2 256-bit prime curve.
//
// Every multi-precision value handled here is four 64-bit limbs stored
// least-significant limb first.  Conditional steps are done with csel, so
// the routines contain no branches.
#include "arm_asm.h"
#include "arm_arch.h"
.arch  armv8-a
.section .rodata

.align 5
// The field prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1
// (the comments further down call it "the polynomial")
.Lpoly:
.quad 0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
// The group order n
.Lord:
.quad 0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
// (p + 1) / 2
.Lpoly_div_2:
.quad 0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
// (n + 1) / 2
.Lord_div_2:
.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff

.text

// void bn_rshift1(BN_ULONG *a);
//
// Shifts the 256-bit value at a right by one bit, in place.  The bit that
// falls off the bottom is discarded and the top bit becomes zero.
.globl ecp_sm2p256_bn_rshift1_unused_alias_guard
.globl bn_rshift1
.type bn_rshift1,%function
.align 5
bn_rshift1:
    // Call-target marker macro; defined outside this file
    // (presumably in arm_arch.h).
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp   x7,x8,[x0]
    ldp   x9,x10,[x0,#16]

    // Right shift: each limb takes its new top bit from the limb above it
    extr  x7,x8,x7,#1
    extr  x8,x9,x8,#1
    extr  x9,x10,x9,#1
    lsr   x10,x10,#1

    // Store results
    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]

    ret
.size bn_rshift1,.-bn_rshift1

// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
//
// r = a - b over four limbs, with no modular correction.  The final borrow
// is dropped (the last step is sbc, not sbcs), so the result wraps
// modulo 2^256.
.globl bn_sub
.type bn_sub,%function
.align 5
bn_sub:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp   x7,x8,[x1]
    ldp   x9,x10,[x1,#16]
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Subtraction
    subs  x7,x7,x11
    sbcs  x8,x8,x12
    sbcs  x9,x9,x13
    sbc   x10,x10,x14

    // Store results
    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]

    ret
.size bn_sub,.-bn_sub

// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
//
// r = a / 2 mod p.  The input is shifted right by one bit and, when it was
// odd, (p + 1) / 2 is added, which makes the result (a + p) / 2.
// The carry out of the top limb is discarded.
// NOTE(review): presumably a is already reduced below p, in which case no
// carry can occur; confirm against the callers.
.globl ecp_sm2p256_div_by_2
.type ecp_sm2p256_div_by_2,%function
.align 5
ecp_sm2p256_div_by_2:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp   x7,x8,[x1]
    ldp   x9,x10,[x1,#16]

    // Save the lowest limb; its bit 0 is the parity of a
    mov   x3,x7

    // Right shift 1
    extr  x7,x8,x7,#1
    extr  x8,x9,x8,#1
    extr  x9,x10,x9,#1
    lsr   x10,x10,#1

    // Load (p + 1) / 2
    adrp  x2,.Lpoly_div_2
    add   x2,x2,#:lo12:.Lpoly_div_2
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Parity check: replace the addend by zero when a was even
    tst   x3,#1
    csel  x11,xzr,x11,eq
    csel  x12,xzr,x12,eq
    csel  x13,xzr,x13,eq
    csel  x14,xzr,x14,eq

    // Add
    adds  x7,x7,x11
    adcs  x8,x8,x12
    adcs  x9,x9,x13
    adc   x10,x10,x14

    // Store results
    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2

// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
//
// r = a / 2 mod n.  Same scheme as ecp_sm2p256_div_by_2, but the addend for
// an odd input is (n + 1) / 2.
// NOTE(review): presumably a is already reduced below n; confirm against
// the callers.
.globl ecp_sm2p256_div_by_2_mod_ord
.type ecp_sm2p256_div_by_2_mod_ord,%function
.align 5
ecp_sm2p256_div_by_2_mod_ord:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp   x7,x8,[x1]
    ldp   x9,x10,[x1,#16]

    // Save the lowest limb; its bit 0 is the parity of a
    mov   x3,x7

    // Right shift 1
    extr  x7,x8,x7,#1
    extr  x8,x9,x8,#1
    extr  x9,x10,x9,#1
    lsr   x10,x10,#1

    // Load (n + 1) / 2
    adrp  x2,.Lord_div_2
    add   x2,x2,#:lo12:.Lord_div_2
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Parity check: replace the addend by zero when a was even
    tst   x3,#1
    csel  x11,xzr,x11,eq
    csel  x12,xzr,x12,eq
    csel  x13,xzr,x13,eq
    csel  x14,xzr,x14,eq

    // Add
    adds  x7,x7,x11
    adcs  x8,x8,x12
    adcs  x9,x9,x13
    adc   x10,x10,x14

    // Store results
    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord

// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
//
// r = 3 * a mod p, computed as ((2a reduced once) + a) reduced once.
// Each reduction subtracts p from the five-limb intermediate and keeps the
// difference only when that subtraction did not borrow.
// a is read twice but r is written only at the end.
// NOTE(review): one conditional subtraction per step is only a full
// reduction if a is already below p; confirm against the callers.
.globl ecp_sm2p256_mul_by_3
.type ecp_sm2p256_mul_by_3,%function
.align 5
ecp_sm2p256_mul_by_3:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp   x7,x8,[x1]
    ldp   x9,x10,[x1,#16]

    // 2*a, with the bit carried out of the top limb kept in x15
    adds  x7,x7,x7
    adcs  x8,x8,x8
    adcs  x9,x9,x9
    adcs  x10,x10,x10
    adcs  x15,xzr,xzr

    // Keep the unreduced value for the select below
    mov   x3,x7
    mov   x4,x8
    mov   x5,x9
    mov   x6,x10

    // Sub polynomial
    adrp  x2,.Lpoly
    add   x2,x2,#:lo12:.Lpoly
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]
    subs  x7,x7,x11
    sbcs  x8,x8,x12
    sbcs  x9,x9,x13
    sbcs  x10,x10,x14
    sbcs  x15,x15,xzr

    // cs (no borrow) means 2a >= p: keep the difference, else the backup
    csel  x7,x7,x3,cs
    csel  x8,x8,x4,cs
    csel  x9,x9,x5,cs
    csel  x10,x10,x6,cs
    eor   x15,x15,x15

    // 3*a
    ldp   x11,x12,[x1]
    ldp   x13,x14,[x1,#16]
    adds  x7,x7,x11
    adcs  x8,x8,x12
    adcs  x9,x9,x13
    adcs  x10,x10,x14
    adcs  x15,xzr,xzr

    mov   x3,x7
    mov   x4,x8
    mov   x5,x9
    mov   x6,x10

    // Sub polynomial
    adrp  x2,.Lpoly
    add   x2,x2,#:lo12:.Lpoly
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]
    subs  x7,x7,x11
    sbcs  x8,x8,x12
    sbcs  x9,x9,x13
    sbcs  x10,x10,x14
    sbcs  x15,x15,xzr

    csel  x7,x7,x3,cs
    csel  x8,x8,x4,cs
    csel  x9,x9,x5,cs
    csel  x10,x10,x6,cs

    // Store results
    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]

    ret
.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3

// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
//
// r = a + b mod p.  The five-limb sum has p subtracted from a copy; the
// copy is used unless that subtraction borrowed.
// NOTE(review): a single conditional subtraction assumes a and b are both
// below p; confirm against the callers.
.globl ecp_sm2p256_add
.type ecp_sm2p256_add,%function
.align 5
ecp_sm2p256_add:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp   x7,x8,[x1]
    ldp   x9,x10,[x1,#16]
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Addition, carry out of the top limb kept in x15
    adds  x7,x7,x11
    adcs  x8,x8,x12
    adcs  x9,x9,x13
    adcs  x10,x10,x14
    adc   x15,xzr,xzr

    // Load polynomial
    adrp  x2,.Lpoly
    add   x2,x2,#:lo12:.Lpoly
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Backup Addition
    mov   x3,x7
    mov   x4,x8
    mov   x5,x9
    mov   x6,x10

    // Sub polynomial
    subs  x3,x3,x11
    sbcs  x4,x4,x12
    sbcs  x5,x5,x13
    sbcs  x6,x6,x14
    sbcs  x15,x15,xzr

    // Select based on borrow: cc (borrow) means sum < p, keep the sum
    csel  x7,x7,x3,cc
    csel  x8,x8,x4,cc
    csel  x9,x9,x5,cc
    csel  x10,x10,x6,cc

    // Store results
    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_add,.-ecp_sm2p256_add

// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
//
// r = a - b mod p.  The raw difference is kept when the subtraction did not
// borrow; otherwise p is added back (carry out of that addition is ignored).
// NOTE(review): assumes a and b are both below p; confirm against callers.
.globl ecp_sm2p256_sub
.type ecp_sm2p256_sub,%function
.align 5
ecp_sm2p256_sub:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp   x7,x8,[x1]
    ldp   x9,x10,[x1,#16]
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Subtraction; x15 becomes 0 without borrow, all-ones with borrow
    subs  x7,x7,x11
    sbcs  x8,x8,x12
    sbcs  x9,x9,x13
    sbcs  x10,x10,x14
    sbc   x15,xzr,xzr

    // Load polynomial
    adrp  x2,.Lpoly
    add   x2,x2,#:lo12:.Lpoly
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Backup subtraction
    mov   x3,x7
    mov   x4,x8
    mov   x5,x9
    mov   x6,x10

    // Add polynomial
    adds  x3,x3,x11
    adcs  x4,x4,x12
    adcs  x5,x5,x13
    adcs  x6,x6,x14
    tst   x15,x15

    // Select based on the borrow mask: eq (x15 == 0) keeps the difference
    csel  x7,x7,x3,eq
    csel  x8,x8,x4,eq
    csel  x9,x9,x5,eq
    csel  x10,x10,x6,eq

    // Store results
    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_sub,.-ecp_sm2p256_sub

// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
//
// r = a - b mod n.  Same scheme as ecp_sm2p256_sub, with the group order n
// as the modulus.
// NOTE(review): assumes a and b are both below n; confirm against callers.
.globl ecp_sm2p256_sub_mod_ord
.type ecp_sm2p256_sub_mod_ord,%function
.align 5
ecp_sm2p256_sub_mod_ord:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp   x7,x8,[x1]
    ldp   x9,x10,[x1,#16]
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Subtraction; x15 becomes 0 without borrow, all-ones with borrow
    subs  x7,x7,x11
    sbcs  x8,x8,x12
    sbcs  x9,x9,x13
    sbcs  x10,x10,x14
    sbc   x15,xzr,xzr

    // Load order
    adrp  x2,.Lord
    add   x2,x2,#:lo12:.Lord
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

    // Backup subtraction
    mov   x3,x7
    mov   x4,x8
    mov   x5,x9
    mov   x6,x10

    // Add order
    adds  x3,x3,x11
    adcs  x4,x4,x12
    adcs  x5,x5,x13
    adcs  x6,x6,x14
    tst   x15,x15

    // Select based on the borrow mask: eq (x15 == 0) keeps the difference
    csel  x7,x7,x3,eq
    csel  x8,x8,x4,eq
    csel  x9,x9,x5,eq
    csel  x10,x10,x6,eq

    // Store results
    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord

// RDC: reduce a 512-bit product modulo p.
//
// In:      x7..x10  = low  four limbs s0..s3
//          x11..x14 = high four limbs s4..s7
// Out:     x7..x10  = reduced value
// Scratch: x3-x6, x11-x16 and the 32 bytes at [sp,#32]..[sp,#63]; the
//          expanding function must have reserved that stack area.
.macro RDC
    // a = |  s7   | ... | s0  |, where si are 64-bit quantities
    //   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
    // |    s7     |    s6     |    s5     |    s4     |
    // | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
    // |    s3     |    s2     |    s1     |    s0     |
    // | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
    // =================================================
    // | a8  | a11 | a10 | a9  | a8  |   0 |    s4     | (+)
    // | a9  | a15 |    s6     | a11 |   0 | a10 | a9  | (+)
    // | a10 |   0 | a14 | a13 | a12 |   0 |    s5     | (+)
    // | a11 |   0 |    s7     | a13 |   0 | a12 | a11 | (+)
    // | a12 |   0 |    s7     | a13 |   0 |    s6     | (+)
    // | a12 |   0 |   0 | a15 | a14 |   0 | a14 | a13 | (+)
    // | a13 |   0 |   0 |   0 | a15 |   0 | a14 | a13 | (+)
    // | a13 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
    // | a14 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
    // | a14 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
    // | a15 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
    // | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
    // |    s7     |   0 |   0 |   0 |   0 |   0 |   0 | (+)
    // |   0 |   0 |   0 |   0 |   0 | a8  |   0 |   0 | (-)
    // |   0 |   0 |   0 |   0 |   0 | a9  |   0 |   0 | (-)
    // |   0 |   0 |   0 |   0 |   0 | a13 |   0 |   0 | (-)
    // |   0 |   0 |   0 |   0 |   0 | a14 |   0 |   0 | (-)
    // | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
    // |    V[3]   |    V[2]   |    V[1]   |    V[0]   |

    // 1. 64-bit addition: the table entries that are whole 64-bit limbs
    // t2 = s6 + s7 + s7   (x5 low word, x4 carries)
    adds  x5,x13,x14
    adcs  x4,xzr,xzr
    adds  x5,x5,x14
    adcs  x4,x4,xzr
    // t3 = s4 + s5 + t2   (x6 low word, x15 carries)
    adds  x6,x11,x5
    adcs  x15,x4,xzr
    adds  x6,x6,x12
    adcs  x15,x15,xzr
    // sum into s0..s3; x3 collects everything above 2^256
    adds  x7,x7,x6
    adcs  x8,x8,x15
    adcs  x9,x9,x5
    adcs  x10,x10,x14
    adcs  x3,xzr,xzr
    adds  x10,x10,x4
    adcs  x3,x3,xzr

    // Park the partial sum; x7..x10 are reused as scratch below
    stp   x7,x8,[sp,#32]
    stp   x9,x10,[sp,#48]

    // 2. 64-bit to 32-bit spread
    mov   x4,#0xffffffff
    mov   x7,x11
    mov   x8,x12
    mov   x9,x13
    mov   x10,x14
    and   x7,x7,x4  // a8
    and   x8,x8,x4  // a10
    and   x9,x9,x4  // a12
    and   x10,x10,x4  // a14
    lsr   x11,x11,#32  // a9
    lsr   x12,x12,#32  // a11
    lsr   x13,x13,#32  // a13
    lsr   x14,x14,#32  // a15

    // 3. 32-bit addition (the left-hand name is the value the register
    //    held after step 2; t1..t4 are fresh temporaries)
    add   x4,x10,x9  // t1 <- a12 + a14
    add   x5,x14,x13  // t2 <- a13 + a15
    add   x6,x7,x11  // t3 <- a8 + a9
    add   x15,x10,x8  // t4 <- a10 + a14
    add   x14,x14,x12  // a15 <- a11 + a15
    add   x9,x5,x4  // a12 <- a12 + a13 + a14 + a15
    add   x8,x8,x9  // a10 <- a10 + a12 + a13 + a14 + a15
    add   x8,x8,x9  // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
    add   x8,x8,x6  // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
    add   x8,x8,x12  // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
    add   x9,x9,x13  // a12 <- a12 + 2*a13 + a14 + a15
    add   x9,x9,x12  // a12 <- a11 + a12 + 2*a13 + a14 + a15
    add   x9,x9,x7  // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
    add   x6,x6,x10  // t3 <- a8 + a9 + a14
    add   x6,x6,x13  // t3 <- a8 + a9 + a13 + a14
    add   x11,x11,x5  // a9 <- a9 + a13 + a15
    add   x12,x12,x11  // a11 <- a9 + a11 + a13 + a15
    add   x12,x12,x5  // a11 <- a9 + a11 + 2*(a13 + a15)
    add   x4,x4,x15  // t1 <- a10 + a12 + 2*a14

    // Column sums and the registers that now hold them:
    // U[0]  x12   a9 + a11 + 2*(a13 + a15)
    // U[1]  x4    a10 + a12 + 2*a14
    // U[2]  x6    a8 + a9 + a13 + a14   (to be subtracted)
    // U[3]  x9    a8 + a11 + a12 + 2*a13 + a14 + a15
    // U[4]  x11   a9 + a13 + a15
    // U[5]  x15   a10 + a14
    // U[6]  x14   a11 + a15
    // U[7]  x8    a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)

    // 4. 32-bit to 64-bit: lay U[1], U[3], U[5], U[7] (the odd columns,
    //    each offset by 32 bits) across limbs x7, x4, x9, x15, overflow x8
    lsl   x7,x4,#32
    extr  x4,x9,x4,#32
    extr  x9,x15,x9,#32
    extr  x15,x8,x15,#32
    lsr   x8,x8,#32

    // 5. 64-bit addition: add the even columns U[0], U[4], U[6] at limbs
    //    0, 2, 3 (U[2] is handled by the subtraction in step 6)
    adds  x12,x12,x7
    adcs  x4,x4,xzr
    adcs  x11,x11,x9
    adcs  x14,x14,x15
    adcs  x3,x3,x8

    // V[0]   x12
    // V[1]   x4
    // V[2]   x11
    // V[3]   x14
    // carry  x3
    // sub    x6

    // 6. Process s0-s3: reload the partial sum parked in step 1
    ldp   x7,x8,[sp,#32]
    ldp   x9,x10,[sp,#48]
    // add with V0-V3
    adds  x7,x7,x12
    adcs  x8,x8,x4
    adcs  x9,x9,x11
    adcs  x10,x10,x14
    adcs  x3,x3,xzr
    // sub with U[2]; it sits at bit 64, i.e. limb 1
    subs  x8,x8,x6
    sbcs  x9,x9,xzr
    sbcs  x10,x10,xzr
    sbcs  x3,x3,xzr

    // 7. MOD
    // First Mod: x3 is the word c above 2^256.  Since
    // 2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p), add c to limb 0,
    // (c << 32) - c to limb 1 and c << 32 to limb 3.
    // NOTE(review): this presumes c is non-negative and fits in 32 bits
    // at this point; confirm the bound.
    lsl   x4,x3,#32
    subs  x5,x4,x3

    adds  x7,x7,x3
    adcs  x8,x8,x5
    adcs  x9,x9,xzr
    adcs  x10,x10,x4

    // Last Mod
    // return y - p if y >= p else y
    mov   x11,x7
    mov   x12,x8
    mov   x13,x9
    mov   x14,x10

    adrp  x3,.Lpoly
    add   x3,x3,#:lo12:.Lpoly
    ldp   x4,x5,[x3]
    ldp   x6,x15,[x3,#16]

    // Capture the carry of the First Mod addition; none of the mov, adrp,
    // add and ldp instructions in between modify the flags
    adcs  x16,xzr,xzr

    subs  x7,x7,x4
    sbcs  x8,x8,x5
    sbcs  x9,x9,x6
    sbcs  x10,x10,x15
    sbcs  x16,x16,xzr

    // cs (no borrow) keeps y - p, otherwise the backup y
    csel  x7,x7,x11,cs
    csel  x8,x8,x12,cs
    csel  x9,x9,x13,cs
    csel  x10,x10,x14,cs

.endm

// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
//
// r = a * b mod p: a 4x4-limb schoolbook multiplication, accumulated column
// by column, followed by RDC.
//
// Stack frame (80 bytes):
//   [sp,#0]   x29, x30
//   [sp,#16]  x16, x17
//   [sp,#32]  32 bytes of scratch used by RDC
//   [sp,#64]  x19, x20
.globl ecp_sm2p256_mul
.type ecp_sm2p256_mul,%function
.align 5
ecp_sm2p256_mul:
    // Link-register protection macro; defined outside this file
    // (presumably in arm_arch.h).
    AARCH64_SIGN_LINK_REGISTER
    // Store scalar registers
    stp   x29,x30,[sp,#-80]!
    add   x29,sp,#0
    stp   x16,x17,[sp,#16]
    stp   x19,x20,[sp,#64]

    // Load inputs: a = s0..s3 in x7..x10, b = s4..s7 in x11..x14
    ldp   x7,x8,[x1]
    ldp   x9,x10,[x1,#16]
    ldp   x11,x12,[x2]
    ldp   x13,x14,[x2,#16]

// ### multiplication ###
// ========================
//             s3 s2 s1 s0
// *           s7 s6 s5 s4
// ------------------------
// +           s0 s0 s0 s0
//              *  *  *  *
//             s7 s6 s5 s4
//          s1 s1 s1 s1
//           *  *  *  *
//          s7 s6 s5 s4
//       s2 s2 s2 s2
//        *  *  *  *
//       s7 s6 s5 s4
//    s3 s3 s3 s3
//     *  *  *  *
//    s7 s6 s5 s4
// ------------------------
// s7 s6 s5 s4 s3 s2 s1 s0
// ========================

// ### s0*s4 ###
    mul   x16,x7,x11
    umulh x5,x7,x11

// ### s1*s4 + s0*s5 ###
    mul   x3,x8,x11
    umulh x4,x8,x11
    adds  x5,x5,x3
    adcs  x6,x4,xzr

    mul   x3,x7,x12
    umulh x4,x7,x12
    adds  x5,x5,x3
    adcs  x6,x6,x4
    adcs  x15,xzr,xzr

// ### s2*s4 + s1*s5 + s0*s6 ###
    mul   x3,x9,x11
    umulh x4,x9,x11
    adds  x6,x6,x3
    adcs  x15,x15,x4

    mul   x3,x8,x12
    umulh x4,x8,x12
    adds  x6,x6,x3
    adcs  x15,x15,x4
    adcs  x17,xzr,xzr

    mul   x3,x7,x13
    umulh x4,x7,x13
    adds  x6,x6,x3
    adcs  x15,x15,x4
    adcs  x17,x17,xzr

// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
    mul   x3,x10,x11
    umulh x4,x10,x11
    adds  x15,x15,x3
    adcs  x17,x17,x4
    adcs  x19,xzr,xzr

    mul   x3,x9,x12
    umulh x4,x9,x12
    adds  x15,x15,x3
    adcs  x17,x17,x4
    adcs  x19,x19,xzr

    mul   x3,x8,x13
    umulh x4,x8,x13
    adds  x15,x15,x3
    adcs  x17,x17,x4
    adcs  x19,x19,xzr

    mul   x3,x7,x14
    umulh x4,x7,x14
    adds  x15,x15,x3
    adcs  x17,x17,x4
    adcs  x19,x19,xzr

// ### s3*s5 + s2*s6 + s1*s7 ###
    mul   x3,x10,x12
    umulh x4,x10,x12
    adds  x17,x17,x3
    adcs  x19,x19,x4
    adcs  x20,xzr,xzr

    mul   x3,x9,x13
    umulh x4,x9,x13
    adds  x17,x17,x3
    adcs  x19,x19,x4
    adcs  x20,x20,xzr

    // The finished column is written straight into x11, whose input
    // limb is not read again below
    mul   x3,x8,x14
    umulh x4,x8,x14
    adds  x11,x17,x3
    adcs  x19,x19,x4
    adcs  x20,x20,xzr

// ### s3*s6 + s2*s7 ###
    mul   x3,x10,x13
    umulh x4,x10,x13
    adds  x19,x19,x3
    adcs  x20,x20,x4
    adcs  x17,xzr,xzr

    mul   x3,x9,x14
    umulh x4,x9,x14
    adds  x12,x19,x3
    adcs  x20,x20,x4
    adcs  x17,x17,xzr

// ### s3*s7 ###
    mul   x3,x10,x14
    umulh x4,x10,x14
    adds  x13,x20,x3
    adcs  x14,x17,x4

    // Move the low four product limbs to where RDC expects them
    mov   x7,x16
    mov   x8,x5
    mov   x9,x6
    mov   x10,x15

    // result of mul: s7 s6 s5 s4 s3 s2 s1 s0 in x14..x11, x10..x7

// ### Reduction ###
    RDC

    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]

    // Restore scalar registers
    ldp   x16,x17,[sp,#16]
    ldp   x19,x20,[sp,#64]
    ldp   x29,x30,[sp],#80

    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ecp_sm2p256_mul,.-ecp_sm2p256_mul

// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
//
// r = a * a mod p.  The six cross products are summed once and doubled,
// the four squares are added on top, and the 512-bit result goes through
// RDC.  The stack frame has the same layout as in ecp_sm2p256_mul.
.globl ecp_sm2p256_sqr
.type ecp_sm2p256_sqr,%function
.align 5

ecp_sm2p256_sqr:
    AARCH64_SIGN_LINK_REGISTER
    // Store scalar registers
    stp   x29,x30,[sp,#-80]!
    add   x29,sp,#0
    stp   x16,x17,[sp,#16]
    stp   x19,x20,[sp,#64]

    // Load inputs: the limbs of a go to x11..x14, named s4..s7 below
    ldp   x11,x12,[x1]
    ldp   x13,x14,[x1,#16]

// ### square ###
// ========================
//             s7 s6 s5 s4
// *           s7 s6 s5 s4
// ------------------------
// +           s4 s4 s4 s4
//              *  *  *  *
//             s7 s6 s5 s4
//          s5 s5 s5 s5
//           *  *  *  *
//          s7 s6 s5 s4
//       s6 s6 s6 s6
//        *  *  *  *
//       s7 s6 s5 s4
//    s7 s7 s7 s7
//     *  *  *  *
//    s7 s6 s5 s4
// ------------------------
// s7 s6 s5 s4 s3 s2 s1 s0
// ========================

// ### s4*s5 ###
    mul   x8,x11,x12
    umulh x9,x11,x12

// ### s4*s6 ###
    mul   x3,x13,x11
    umulh x10,x13,x11
    adds  x9,x9,x3
    adcs  x10,x10,xzr

// ### s4*s7 + s5*s6 ###
    mul   x3,x14,x11
    umulh x4,x14,x11
    adds  x10,x10,x3
    adcs  x7,x4,xzr

    mul   x3,x13,x12
    umulh x4,x13,x12
    adds  x10,x10,x3
    adcs  x7,x7,x4
    adcs  x5,xzr,xzr

// ### s5*s7 ###
    mul   x3,x14,x12
    umulh x4,x14,x12
    adds  x7,x7,x3
    adcs  x5,x5,x4

// ### s6*s7 ###
    mul   x3,x14,x13
    umulh x4,x14,x13
    adds  x5,x5,x3
    adcs  x6,x4,xzr

// ### double the cross-product sum ###
// (six limbs, low to high: x8,x9,x10,x7,x5,x6; the bit shifted out of
//  the top goes to x15)
    adds  x8,x8,x8
    adcs  x9,x9,x9
    adcs  x10,x10,x10
    adcs  x7,x7,x7
    adcs  x5,x5,x5
    adcs  x6,x6,x6
    adcs  x15,xzr,xzr

// ### s4*s4 ###
    mul   x16,x11,x11
    umulh x17,x11,x11

// ### s5*s5 ###
    // x11 and x12 are overwritten here; s4 and s5 are not read again
    mul   x11,x12,x12
    umulh x12,x12,x12

// ### s6*s6 ###
    mul   x3,x13,x13
    umulh x4,x13,x13

// ### s7*s7 ###
    mul   x19,x14,x14
    umulh x20,x14,x14

    // Add the squares onto the doubled cross products; the lowest limb
    // of the result is x16 on its own
    adds  x8,x8,x17
    adcs  x9,x9,x11
    adcs  x10,x10,x12
    adcs  x7,x7,x3
    adcs  x5,x5,x4
    adcs  x6,x6,x19
    adcs  x15,x15,x20

    // Arrange the eight limbs the way RDC expects them
    mov   x11,x7
    mov   x7,x16
    mov   x12,x5
    mov   x13,x6
    mov   x14,x15

    // result of square: s7 s6 s5 s4 s3 s2 s1 s0 in x14..x11, x10..x7

// ### Reduction ###
    RDC

    stp   x7,x8,[x0]
    stp   x9,x10,[x0,#16]

    // Restore scalar registers
    ldp   x16,x17,[sp,#16]
    ldp   x19,x20,[sp,#64]
    ldp   x29,x30,[sp],#80

    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr