@ GHASH for 32-bit ARM: table-driven 4-bit integer routines
@ (gcm_ghash_4bit, gcm_gmult_4bit) and NEON polynomial-multiply routines
@ (gcm_init_neon, gcm_gmult_neon, gcm_ghash_neon).
@
@ Syntax: GNU as, ARM or Thumb-2 (unified syntax), run through the C
@ preprocessor.  Comments use the ARM comment character.
@
@ NOTE(review): the register roles documented below are read off the
@ code; the matching C prototypes live with the callers and should be
@ confirmed there.

#include "arm_asm.h"
#include "arm_arch.h"

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
@ Unified syntax places the condition code after the size suffix.
#define ldrplb	ldrbpl
#define ldrneb	ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

@-----------------------------------------------------------------------
@ rem_4bit: 16 halfword constants (32 bytes, 32-byte aligned).
@ Indexed by the 4 bits shifted out of the bottom of Z (rem); the
@ selected value is XORed into the top halfword of the most significant
@ word of Z.
@-----------------------------------------------------------------------
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

@-----------------------------------------------------------------------
@ rem_4bit_get: local trampoline used by gcm_gmult_4bit.
@ Out:  r2 = address of rem_4bit
@ Does not return through lr; it branches to .Lrem_4bit_got.
@ In ARM mode the address is formed as pc-8-32: pc reads as this
@ instruction plus 8, and the 32-byte table sits immediately before it.
@-----------------------------------------------------------------------
.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32		@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop				@ padding: four instructions in total,
	nop				@ 16 bytes in ARM mode
.size	rem_4bit_get,.-rem_4bit_get

@-----------------------------------------------------------------------
@ gcm_ghash_4bit
@ In:   r0 = Xi    16-byte hash state, updated in place, stored
@                  big-endian
@       r1 = Htbl  table of 16-byte entries, indexed by nibble * 16
@       r2 = inp   input bytes
@       r3 = len   byte count, turned into an end pointer on entry
@ Saves r4-r11 and lr; r12 and r14 are scratch.
@ Stack after the prologue:
@       [sp,#0..31]  copy of rem_4bit
@       [sp,#32]     end pointer (saved r3)
@       [sp,#36..]   saved r4-r11, lr
@ Z is accumulated in r4 (least significant word) .. r7 (most
@ significant word).
@ The block loop is do/while shaped and leaves only when inp == end, so
@ len has to be a non-zero multiple of 16.
@-----------------------------------------------------------------------
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4				@ 16-byte boundary, 48 bytes past rem_4bit
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	@ Per block: r12 = inp[15] ^ Xi[15], split into nhi (r14, already
	@ multiplied by 16) and nlo (r12).  r3 counts the remaining byte
	@ index from 14 down.
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14		@ rem*2: halfword index into rem_4bit
	eor	r4,r8,r4,lsr#4		@ Z = (Z >> 4) ^ Htbl[nhi], word by word
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	@ One input byte per iteration (low nibble, then high nibble).
	@ The flags set by subs below stay live until the closing bpl:
	@ nothing in between writes the flags, and the pl-conditional
	@ instructions prefetch and split the next byte only while r3 >= 0.
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef __thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef __thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef __thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef __thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4			@ low byte is the new Xi[15], used at .Louter
	@ Write Z back to Xi big-endian, one word at a time: rev+str on
	@ little-endian ARMv7 and later, plain str on big-endian, byte
	@ stores otherwise.  The cmp below sets the flags consumed by
	@ ldrneb and by the final bne; none of the store sequences
	@ changes them.
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3			@ more input?
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]		@ prefetch inp[15] of the next block
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36		@ drop rem_4bit copy (32) + saved r3 (4)
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

@-----------------------------------------------------------------------
@ gcm_gmult_4bit
@ In:   r0 = Xi    16-byte hash state, updated in place, stored
@                  big-endian
@       r1 = Htbl  table of 16-byte entries, indexed by nibble * 16
@ Saves r4-r11 and lr; r2, r3, r12 and r14 are scratch.
@ Same nibble loop as gcm_ghash_4bit, except that the nibbles come from
@ Xi alone and rem_4bit is read in place through r2 instead of from a
@ stack copy.
@-----------------------------------------------------------------------
.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get		@ sets r2 = &rem_4bit, comes back below
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	@ r3 = index of the Xi byte to fetch next; the flags from subs
	@ stay live until the closing bpl.
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef __thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef __thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Loop
	@ Write Z (r4 = least significant word .. r7 = most significant)
	@ back to Xi big-endian.
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

@ The NEON routines below touch only d0-d7 and d16-d31 (never the
@ callee-saved d8-d15) and use no stack.

@-----------------------------------------------------------------------
@ gcm_init_neon
@ In:   r0 = destination for the 16-byte twisted H
@       r1 = H, 16 bytes; the first 8 bytes go to d7 and the next 8 to
@            d6, so q3 holds H with its two 64-bit halves swapped
@ The twist is a 128-bit rotate left by one bit, XORed with the
@ constant 0xc2....01 when the top bit of H was set.
@-----------------------------------------------------------------------
.globl	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26			@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	RET				@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

@-----------------------------------------------------------------------
@ gcm_gmult_neon
@ In:   r0 = Xi, 16 bytes, updated in place
@       r1 = twisted H as written by gcm_init_neon
@ Loads Xi into q3, sets r3 = 16 and jumps into the shared multiply
@ body at .Lgmult_neon inside gcm_ghash_neon; with r3 = 16 that body
@ runs once and then falls through to the common write-back.
@-----------------------------------------------------------------------
.globl	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}		@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

@-----------------------------------------------------------------------
@ gcm_ghash_neon
@ In:   r0 = Xi, 16 bytes, updated in place
@       r1 = twisted H as written by gcm_init_neon
@       r2 = inp
@       r3 = len in bytes; the loop subtracts 16 per block and leaves
@            only on exactly zero, so len has to be a non-zero multiple
@            of 16
@ Register use inside the loop:
@       q0       Xi, and the low half of the 256-bit product
@       q3       current block ^ Xi (operand B)
@       d26,d27  twisted H halves; d28 = d26 ^ d27 (Karatsuba operand)
@       d29-d31  48-, 32- and 16-bit masks for the partial products
@       q1,q2    middle and high Karatsuba products
@       q8-q11   scratch
@-----------------------------------------------------------------------
.globl	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}		@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
.Lgmult_neon:
	@ 64x64 polynomial multiply built from 8-bit vmull.p8 products:
	@ q0 = d26 * d6
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d0, d6, d6, #1		@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2		@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d0, d6, d6, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7		@ Karatsuba pre-processing
	@ q1 = d28 * d6, with d6 now holding the XOR of both halves of B
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d2, d6, d6, #1		@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2		@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d2, d6, d6, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	@ q2 = d27 * d7
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8	d4, d7, d7, #1		@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8	d22, d7, d7, #2		@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8	d4, d7, d7, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3		@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20		@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16			@ undo the two post-increments of 8
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	RET				@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
@ NUL-terminated ASCII: GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2