#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

//----------------------------------------------------------------------
// poly1305_init
//
// In:   x0 = context
//       x1 = key pointer (may be NULL)
//       x2 = destination for the two routine addresses
// Out:  x0 = 0 when x1 is NULL, 1 otherwise
//
// Effects:
//   - bytes [x0,#0] .. [x0,#31] are zeroed (hash value and the
//     is_base2_26 flag), also on the NULL-key path;
//   - the two 64-bit key words are byte-swapped on big-endian builds,
//     masked with 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and stored at
//     [x0,#32];
//   - the addresses of the block and emit routines are stored at [x2]
//     (two 32-bit words under __ILP32__, two 64-bit words otherwise).
//     The NEON variants are chosen when OPENSSL_armcap_P has the
//     ARMV7_NEON bit set, the scalar ones otherwise.
//
// Clobbers: x7-x9, x12, x13, x17, flags.  Leaf routine, no stack use.
//----------------------------------------------------------------------
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq		// return value 0 if no key was given
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48	// x9 = 0x0ffffffc0fffffff
#ifdef __AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON		// flags stay live until the csel pair

	// Local aliases are used so that adr resolves at assembly time.
	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit
	adr	x8,.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq		// eq: NEON bit clear, keep scalar
	csel	x13,x13,x8,eq

#ifdef __ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

//----------------------------------------------------------------------
// poly1305_blocks -- scalar block routine, hash kept in base 2^64
//
// In:   x0 = context: h0,h1 at [x0], h2 at [x0,#16], r0,r1 at [x0,#32]
//       x1 = input pointer (post-incremented by 16 per block)
//       x2 = length in bytes; rounded down to a multiple of 16 here
//       x3 = value added into h2 (with carry) for every block
// Out:  h0..h2 written back to the context.  Nothing is loaded or
//       stored when the rounded length is zero.
//
// Clobbers: x1, x2, x4-x14, flags.  Leaf routine, no stack use.
//----------------------------------------------------------------------
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef __AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3		// consumes carry from adcs above
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2	// x10 = 5 * (x14 >> 2)
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

//----------------------------------------------------------------------
// poly1305_emit -- final reduction and tag output, hash in base 2^64
//
// In:   x0 = context (h0,h1 at [x0], h2 at [x0,#16])
//       x1 = 16-byte output buffer
//       x2 = 16-byte nonce
//
// Computes h+5; if that sum reaches bit 130 the low 128 bits of h+5 are
// used, otherwise the low 128 bits of h.  The nonce is then added
// modulo 2^128 and the result stored at [x1].  On big-endian builds
// the nonce words are swapped within each 64-bit register before the
// addition and the result is byte-reversed before the store.
//
// Clobbers: x4-x6, x10-x14, flags.  Leaf routine, no stack use.
//----------------------------------------------------------------------
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef __AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef __AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit

//----------------------------------------------------------------------
// poly1305_mult -- file-local helper: one multiply-and-reduce step
//
// In:   x4,x5,x6 = h0,h1,h2 (base 2^64)
//       x7,x8    = r0,r1
//       x9       = s1 = r1 + (r1 >> 2)
// Out:  x4,x5,x6 = product, folded with the same "final reduction"
//                  sequence used in the scalar loop
//
// Clobbers: x10-x14, flags.  Does not touch the stack; callers reach it
// with bl, so their x30 is overwritten and must be reloaded by them.
//----------------------------------------------------------------------
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

//----------------------------------------------------------------------
// poly1305_splat -- file-local helper: write one table column
//
// In:   x4,x5,x6 = value in base 2^64
//       x0       = address of the 32-bit column to fill
//
// Splits the value into five 26-bit limbs r0..r4 and stores nine
// 32-bit words 16 bytes apart starting at [x0], in the order
// r0, r1, s1, r2, s2, r3, s3, r4, s4, where sN = rN*5.
//
// Clobbers: x12-x16.  Flags, x0 and x4-x6 are left unchanged.
//----------------------------------------------------------------------
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

//----------------------------------------------------------------------
// poly1305_blocks_neon -- NEON block routine
//
// In:   x0 = context, x1 = input, x2 = length, x3 = per-block top-limb
//       addend (same roles as in poly1305_blocks)
//
// Context fields used here:
//   [x0,#0..#19]  hash: three 64-bit words (base 2^64) or five 32-bit
//                 limbs (base 2^26), as indicated by the flag below
//   [x0,#24]      is_base2_26 flag (64-bit, non-zero = base 2^26)
//   [x0,#32]      r0,r1
//   [x0,#48..]    table written by poly1305_splat: nine 16-byte rows
//                 (r0,r1,s1,r2,s2,r3,s3,r4,s4), each holding four 32-bit
//                 columns.  Column byte offset 12 receives r^1, and
//                 offsets 8, 4, 0 receive r^2, r^3, r^4.
//
// Dispatch:
//   - length below 128 and flag clear: tail-branch to the scalar routine;
//   - otherwise an 80-byte frame is pushed (x29,x30 at [sp], d8-d15 at
//     [sp,#16..#79]).  On the first entry with the flag clear the table
//     is built and the flag is set.
//
// x30 is signed on entry to .Lblocks_neon (paciasp) and authenticated
// before ret (autiasp).  Every bl sequence reloads x30 from [sp,#8].
//
// Clobbers: x1-x17, v0-v7, v16-v31, flags; d8-d15 are saved/restored.
//----------------------------------------------------------------------
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]		// is_base2_26
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks	// short input, still base 2^64

.Lblocks_neon:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	// Odd number of blocks: process one block with scalar code first.
	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef __AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]		// bl overwrote the link register

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult		// x30 is reloaded after the table build

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12		// column at byte offset 12 gets r^1
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64		// flags stay live until b.ls below
	csel	x16,x17,x16,lo		// under 64 bytes: read inp[2:3] from zeros

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64		// flags stay live until b.ls below
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24		// move addend to its place in the top limb
	add	x15,x0,#48		// x15 = r^n table

#ifdef __AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef __AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38	// v31 = 0x03ffffff in each 64-bit lane

	b.ls	.Lskip_loop		// flags from the subs x2,x2,#64 above

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64		// flags stay live until b.hi at loop end
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef __AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef __AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32		// flags also consumed by b.eq below
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail		// flags from adds x2,x2,#32 above

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80		// x30 is already current, only pop x29
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

//----------------------------------------------------------------------
// poly1305_emit_neon -- tag output for either hash representation
//
// In:   x0 = context, x1 = 16-byte output buffer, x2 = 16-byte nonce
//
// When the is_base2_26 flag at [x0,#24] is zero this branches to the
// scalar emit code.  Otherwise the five 32-bit limbs are recombined
// into base 2^64 and folded once, since they may be only partially
// reduced.  The rest is the same compare-to-modulus, nonce addition
// and store sequence as the scalar routine.
//
// Clobbers: x4-x6, x10-x14, x17, flags.  Leaf routine, no stack use.
//----------------------------------------------------------------------
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	x17,[x0,#24]		// is_base2_26
	// Branch to the local alias, as poly1305_blocks_neon does, so the
	// target is resolved by the assembler without a relocation.
	cbz	x17,.Lpoly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef __AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef __AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

// 32 zero bytes, read in place of inp[2:3] by poly1305_blocks_neon when
// fewer than 64 bytes remain.  Kept in .text so adr can reach it.
.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
// NUL-terminated ASCII identification string:
// "Poly1305 for ARMv8, CRYPTOGAMS by" followed by the author address.
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2