#include "arm_arch.h"

// Poly1305 for AArch64 (GNU as syntax, run through the C preprocessor).
//
// Context layout, as used by the code below (byte offsets from x0):
//	 0..23	hash value: either three 64-bit limbs (base 2^64) or
//		five 32-bit limbs (base 2^26)
//	24..31	is_base2_26 flag: non-zero once the hash is kept in base 2^26
//	32..47	clamped key r0,r1
//	48..	table of key powers in base 2^26: nine 16-byte rows
//		(r0,r1,s1,r2,s2,r3,s3,r4,s4), each row holding four 32-bit
//		lanes with r^4 in lane 0 down to r^1 in lane 3

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

////////////////////////////////////////////////////////////////
// poly1305_init
// In:	x0 = context, x1 = 16-byte key (may be 0), x2 = two-slot table
//	that receives the addresses of the blocks and emit routines
// Out:	x0 = 0 if x1 was 0 (only the first 32 context bytes are zeroed),
//	otherwise 1
// Clobbers: x7-x13, x17, flags
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq		// return value 0 when no key is given
	b.eq	.Lno_key

	// .LOPENSSL_armcap_P holds the PC-relative distance to
	// OPENSSL_armcap_P; base + distance gives its address.
#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48	// x9 = 0x0ffffffc0fffffff
	ldr	w17,[x10,x11]		// capability word
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq		// scalar routines if the NEON bit is clear
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

////////////////////////////////////////////////////////////////
// poly1305_blocks (scalar, hash in base 2^64)
// In:	x0 = context, x1 = input, x2 = length in bytes (rounded down to
//	a multiple of 16 here), x3 = value added into the top hash limb
//	for every block
// Out:	hash at [x0] updated; nothing is touched when the rounded
//	length is 0
// Clobbers: x1, x2, x4-x14, flags
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

////////////////////////////////////////////////////////////////
// poly1305_emit (hash in base 2^64)
// In:	x0 = context, x1 = 16-byte output, x2 = 16-byte nonce
// Out:	[x1] = low 128 bits of (conditionally reduced hash + nonce)
// Clobbers: x4-x6, x10-x14, flags
.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq		// keep h unless h+5 reached bit 130
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit

////////////////////////////////////////////////////////////////
// poly1305_mult (file-local helper, same arithmetic as the body of .Loop)
// In:	x4,x5,x6 = h0,h1,h2; x7,x8 = r0,r1; x9 = s1 = r1 + (r1 >> 2)
// Out:	x4,x5,x6 = product after the "final reduction" step
// Clobbers: x10-x14, flags
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

////////////////////////////////////////////////////////////////
// poly1305_splat (file-local helper)
// In:	x4,x5,x6 = value in base 2^64; x0 = address of one 32-bit lane
//	in the first row of the power table
// Out:	nine 32-bit words r0,r1,s1,r2,s2,r3,s3,r4,s4 (sN = rN*5) stored
//	at x0, x0+16, ..., x0+128
// Clobbers: x12-x16
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

////////////////////////////////////////////////////////////////
// poly1305_blocks_neon
// In:	same register arguments as poly1305_blocks
//	(x0 = context, x1 = input, x2 = length, x3 = per-block top-limb
//	addend)
// Behaviour:
//	- length < 128 and is_base2_26 == 0: branches to poly1305_blocks
//	- otherwise builds an 80-byte frame (x29/x30 at sp+0/8, d8-d15 at
//	  sp+16..79), handles an odd leading block with poly1305_mult,
//	  converts the hash to base 2^26 and, on first use, fills the power
//	  table and sets is_base2_26
// x30 is reloaded from the frame after each group of bl calls, so the
// epilogue only has to pop x29.
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]		// is_base2_26
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]		// bl overwrote the link register

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12		// lane 3 of the table's first row
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4		// next lower lane
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]		// bl overwrote the link register

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo		// read zeros when inp[2:3] is absent

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo		// read zeros when inp[2:3] is absent

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	// No instruction from here to "b.ls .Lskip_loop" writes the flags,
	// so that branch still tests the "subs x2,x2,#64" executed above.
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48		// power table

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	// v0..v8 = table rows r0,r1,s1,r2,s2,r3,s3,r4,s4
	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38	// 26-bit mask in each 64-bit lane

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	// The flags set by this subs are consumed by the csel below and by
	// "b.hi .Loop_neon" at the bottom; nothing in between writes them.
	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	// Flags from this adds also drive "b.eq .Lshort_tail" below; the
	// vector instructions in between do not write them.
	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80		// pop frame; x30 already holds the return address
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

////////////////////////////////////////////////////////////////
// poly1305_emit_neon
// In:	same register arguments as poly1305_emit
//	(x0 = context, x1 = 16-byte output, x2 = 16-byte nonce)
// Branches to poly1305_emit when is_base2_26 is zero; otherwise converts
// the base 2^26 hash to base 2^64, reduces it and finishes exactly like
// poly1305_emit.
// Clobbers: x4-x6, x10-x14, x17, flags
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]		// is_base2_26
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq		// keep h unless h+5 reached bit 130
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

// 32 bytes of zeros: substituted for inp[2:3] by the NEON code when fewer
// than 64 bytes remain.
.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
// PC-relative distance to OPENSSL_armcap_P, consumed by poly1305_init.
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2