Home | History | Annotate | Line # | Download | only in arm
      1      1.1  christos #include "arm_asm.h"
      2  1.1.1.2  christos #include "arm_arch.h"
      3      1.1  christos 
      4      1.1  christos .text
                           @ Everything below, including the rem_4bit data table, is emitted
                           @ into .text so the table can be addressed relative to pc.
                           @ With unified syntax the condition code follows the size suffix,
                           @ so the two pre-UAL conditional byte-load mnemonics used in this
                           @ file are remapped by the preprocessor defines below.
      5  1.1.1.2  christos #if defined(__thumb2__) || defined(__clang__)
      6  1.1.1.2  christos .syntax	unified
      7  1.1.1.2  christos #define ldrplb  ldrbpl
      8  1.1.1.2  christos #define ldrneb  ldrbne
      9  1.1.1.2  christos #endif
                           @ Select the instruction set: Thumb-2 when the compiler targets it,
                           @ otherwise 32-bit ARM.
     10  1.1.1.2  christos #if defined(__thumb2__)
     11  1.1.1.2  christos .thumb
     12  1.1.1.2  christos #else
     13      1.1  christos .code	32
     14  1.1.1.2  christos #endif
     15      1.1  christos 
     16      1.1  christos .type	rem_4bit,%object
                           @ rem_4bit: 16 halfword constants, one for each value of the 4 bits
                           @ (rem) that leave the low end of the 128-bit accumulator on every
                           @ 4-bit right shift.  The 4-bit routines load an entry with ldrh at
                           @ byte offset rem*2 and XOR it, shifted left by 16, into the most
                           @ significant accumulator word.
                           @ Layout matters: the table is 32-byte aligned and exactly 32 bytes
                           @ long, and the ARM-mode code below finds it at a fixed distance
                           @ from pc, so no code or data may be inserted after it.
     17      1.1  christos .align	5
     18      1.1  christos rem_4bit:
     19      1.1  christos .short	0x0000,0x1C20,0x3840,0x2460
     20      1.1  christos .short	0x7080,0x6CA0,0x48C0,0x54E0
     21      1.1  christos .short	0xE100,0xFD20,0xD940,0xC560
     22      1.1  christos .short	0x9180,0x8DA0,0xA9C0,0xB5E0
     23      1.1  christos .size	rem_4bit,.-rem_4bit
     24      1.1  christos 
     25      1.1  christos .type	rem_4bit_get,%function
                           @ rem_4bit_get: helper stub for gcm_gmult_4bit, which branches here
                           @ and is resumed at .Lrem_4bit_got.  It is not a callable function:
                           @ it does not return through lr.
                           @ Out: r2 = address of rem_4bit.
                           @ In ARM mode pc reads as the current instruction + 8, so
                           @ pc - 8 - 32 is the start of the 32-byte table directly above.
                           @ NOTE(review): the two nops appear to pad the stub to four
                           @ instructions (16 bytes in ARM mode); the 8+48 offset used at the
                           @ top of gcm_ghash_4bit relies on that size - confirm before
                           @ changing anything here.
     26      1.1  christos rem_4bit_get:
     27  1.1.1.2  christos #if defined(__thumb2__)
     28  1.1.1.2  christos 	adr	r2,rem_4bit
     29  1.1.1.2  christos #else
     30  1.1.1.2  christos 	sub	r2,pc,#8+32	@ &rem_4bit
     31  1.1.1.2  christos #endif
     32      1.1  christos 	b	.Lrem_4bit_got
     33      1.1  christos 	nop
     34  1.1.1.2  christos 	nop
     35      1.1  christos .size	rem_4bit_get,.-rem_4bit_get
     36      1.1  christos 
     37  1.1.1.2  christos .globl	gcm_ghash_4bit
                           @ gcm_ghash_4bit: for every 16-byte block of input, XOR the block
                           @ into Xi and multiply the result by the hash key using the 4-bit
                           @ table method; the product becomes the new Xi.
                           @ In:   r0 = Xi, 16 bytes, read and rewritten in place
                           @       r1 = Htbl, table of 16-byte entries selected by a nibble
                           @       r2 = inp, r3 = len in bytes
                           @ Regs: r4-r7 hold the 128-bit accumulator (r7 most significant,
                           @       stored at Xi+0); r8-r11 hold the table entry being added;
                           @       r12/r14 hold the low/high nibble of the current byte.
                           @       r4-r11 are saved and restored; r12 and r14 are scratch.
                           @ The end-of-input test happens only after a block was processed.
                           @ NOTE(review): callers presumably guarantee that len is a nonzero
                           @ multiple of 16 - confirm against the C caller.
     38      1.1  christos .type	gcm_ghash_4bit,%function
     39  1.1.1.2  christos .align	4
     40      1.1  christos gcm_ghash_4bit:
                           @ r12 = &rem_4bit.  In ARM mode this is pc - 8 minus the 32-byte
                           @ table and the 16 bytes of rem_4bit_get that sit in between.
     41  1.1.1.2  christos #if defined(__thumb2__)
     42  1.1.1.2  christos 	adr	r12,rem_4bit
     43  1.1.1.2  christos #else
     44  1.1.1.2  christos 	sub	r12,pc,#8+48		@ &rem_4bit
     45  1.1.1.2  christos #endif
     46      1.1  christos 	add	r3,r2,r3		@ r3 to point at the end
     47  1.1.1.2  christos 	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}		@ save r3/end too
     48      1.1  christos 
     49  1.1.1.2  christos 	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy rem_4bit ...
     50  1.1.1.2  christos 	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ ... to stack
                           @ Frame from here on: [sp,#0..31] = copy of rem_4bit,
                           @ [sp,#32] = end pointer, [sp,#36...] = saved r4-r11 and lr.
     51      1.1  christos 
     52      1.1  christos 	ldrb	r12,[r2,#15]		@ r12 = inp[15]
     53      1.1  christos 	ldrb	r14,[r0,#15]		@ r14 = Xi[15]
                           @ .Louter: one iteration per 16-byte block.  On entry r12 holds
                           @ inp[15] and the low byte of r14 holds Xi[15]; the upper bits of
                           @ r14 are discarded by the nibble masks below.
     54      1.1  christos .Louter:
     55      1.1  christos 	eor	r12,r12,r14
     56      1.1  christos 	and	r14,r12,#0xf0		@ r14 = high nibble * 16 (table byte offset)
     57      1.1  christos 	and	r12,r12,#0x0f		@ r12 = low nibble
     58      1.1  christos 	mov	r3,#14			@ r3 = index of the next byte, counts down to 0
     59      1.1  christos 
                           @ Byte 15: accumulator = Htbl[nlo], then shift right by 4 bits,
                           @ add Htbl[nhi] and fold in the rem_4bit entry for the lost bits.
                           @ The load of byte 14 is interleaved with this work.
     60      1.1  christos 	add	r7,r1,r12,lsl#4
     61  1.1.1.2  christos 	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
     62      1.1  christos 	add	r11,r1,r14
     63      1.1  christos 	ldrb	r12,[r2,#14]
     64      1.1  christos 
     65      1.1  christos 	and	r14,r4,#0xf		@ rem
     66  1.1.1.2  christos 	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
     67      1.1  christos 	add	r14,r14,r14		@ rem*2 = byte offset of the halfword entry
     68      1.1  christos 	eor	r4,r8,r4,lsr#4
     69      1.1  christos 	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
     70      1.1  christos 	eor	r4,r4,r5,lsl#28
     71      1.1  christos 	ldrb	r14,[r0,#14]
     72      1.1  christos 	eor	r5,r9,r5,lsr#4
     73      1.1  christos 	eor	r5,r5,r6,lsl#28
     74      1.1  christos 	eor	r6,r10,r6,lsr#4
     75      1.1  christos 	eor	r6,r6,r7,lsl#28
     76      1.1  christos 	eor	r7,r11,r7,lsr#4
     77      1.1  christos 	eor	r12,r12,r14		@ inp[14] ^ Xi[14]
     78      1.1  christos 	and	r14,r12,#0xf0
     79      1.1  christos 	and	r12,r12,#0x0f
     80      1.1  christos 	eor	r7,r7,r8,lsl#16
     81      1.1  christos 
                           @ .Linner: one iteration per remaining byte (14 down to 0), two
                           @ shift-and-add steps each (low nibble, then high nibble).
                           @ The flags set by subs stay live for the whole body: every pl
                           @ instruction and the closing bpl depend on them, so no
                           @ flag-setting instruction may be added inside the loop.
                           @ While r3 is still >= 0 the next input and Xi bytes are fetched
                           @ ahead of use.
     82      1.1  christos .Linner:
     83      1.1  christos 	add	r11,r1,r12,lsl#4
     84      1.1  christos 	and	r12,r4,#0xf		@ rem
     85      1.1  christos 	subs	r3,r3,#1
     86      1.1  christos 	add	r12,r12,r12
     87  1.1.1.2  christos 	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
     88      1.1  christos 	eor	r4,r8,r4,lsr#4
     89      1.1  christos 	eor	r4,r4,r5,lsl#28
     90      1.1  christos 	eor	r5,r9,r5,lsr#4
     91      1.1  christos 	eor	r5,r5,r6,lsl#28
     92      1.1  christos 	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
     93      1.1  christos 	eor	r6,r10,r6,lsr#4
     94  1.1.1.2  christos #ifdef	__thumb2__
     95  1.1.1.2  christos 	it	pl
     96  1.1.1.2  christos #endif
     97  1.1.1.2  christos 	ldrplb	r12,[r2,r3]
     98      1.1  christos 	eor	r6,r6,r7,lsl#28
     99      1.1  christos 	eor	r7,r11,r7,lsr#4
    100      1.1  christos 
    101      1.1  christos 	add	r11,r1,r14
    102      1.1  christos 	and	r14,r4,#0xf		@ rem
    103      1.1  christos 	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
    104      1.1  christos 	add	r14,r14,r14
    105  1.1.1.2  christos 	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
    106      1.1  christos 	eor	r4,r8,r4,lsr#4
    107  1.1.1.2  christos #ifdef	__thumb2__
    108  1.1.1.2  christos 	it	pl
    109  1.1.1.2  christos #endif
    110  1.1.1.2  christos 	ldrplb	r8,[r0,r3]
    111      1.1  christos 	eor	r4,r4,r5,lsl#28
    112      1.1  christos 	eor	r5,r9,r5,lsr#4
    113      1.1  christos 	ldrh	r9,[sp,r14]
    114      1.1  christos 	eor	r5,r5,r6,lsl#28
    115      1.1  christos 	eor	r6,r10,r6,lsr#4
    116      1.1  christos 	eor	r6,r6,r7,lsl#28
    117  1.1.1.2  christos #ifdef	__thumb2__
    118  1.1.1.2  christos 	it	pl
    119  1.1.1.2  christos #endif
    120      1.1  christos 	eorpl	r12,r12,r8
    121      1.1  christos 	eor	r7,r11,r7,lsr#4
    122  1.1.1.2  christos #ifdef	__thumb2__
    123  1.1.1.2  christos 	itt	pl
    124  1.1.1.2  christos #endif
    125      1.1  christos 	andpl	r14,r12,#0xf0
    126      1.1  christos 	andpl	r12,r12,#0x0f
    127      1.1  christos 	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
    128      1.1  christos 	bpl	.Linner
    129      1.1  christos 
                           @ Block done.  Advance inp, keep the unswapped low word in r14 (its
                           @ low byte is the new Xi[15] needed at .Louter), and write the
                           @ accumulator back to Xi in big-endian byte order: rev + str on
                           @ little-endian ARMv7 and later, plain str on big-endian, and
                           @ byte-by-byte stores otherwise.
    130      1.1  christos 	ldr	r3,[sp,#32]		@ re-load r3/end
    131      1.1  christos 	add	r2,r2,#16
    132      1.1  christos 	mov	r14,r4
    133      1.1  christos #if __ARM_ARCH__>=7 && defined(__ARMEL__)
    134      1.1  christos 	rev	r4,r4
    135      1.1  christos 	str	r4,[r0,#12]
    136      1.1  christos #elif defined(__ARMEB__)
    137      1.1  christos 	str	r4,[r0,#12]
    138      1.1  christos #else
    139      1.1  christos 	mov	r9,r4,lsr#8
    140      1.1  christos 	strb	r4,[r0,#12+3]
    141      1.1  christos 	mov	r10,r4,lsr#16
    142      1.1  christos 	strb	r9,[r0,#12+2]
    143      1.1  christos 	mov	r11,r4,lsr#24
    144      1.1  christos 	strb	r10,[r0,#12+1]
    145      1.1  christos 	strb	r11,[r0,#12]
    146      1.1  christos #endif
                           @ The flags from this compare are consumed by the conditional byte
                           @ load and the bne further down; nothing in between alters them.
    147      1.1  christos 	cmp	r2,r3
    148      1.1  christos #if __ARM_ARCH__>=7 && defined(__ARMEL__)
    149      1.1  christos 	rev	r5,r5
    150      1.1  christos 	str	r5,[r0,#8]
    151      1.1  christos #elif defined(__ARMEB__)
    152      1.1  christos 	str	r5,[r0,#8]
    153      1.1  christos #else
    154      1.1  christos 	mov	r9,r5,lsr#8
    155      1.1  christos 	strb	r5,[r0,#8+3]
    156      1.1  christos 	mov	r10,r5,lsr#16
    157      1.1  christos 	strb	r9,[r0,#8+2]
    158      1.1  christos 	mov	r11,r5,lsr#24
    159      1.1  christos 	strb	r10,[r0,#8+1]
    160      1.1  christos 	strb	r11,[r0,#8]
    161      1.1  christos #endif
    162  1.1.1.2  christos 
                           @ Only when more input remains: fetch byte 15 of the next block.
    163  1.1.1.2  christos #ifdef __thumb2__
    164  1.1.1.2  christos 	it	ne
    165  1.1.1.2  christos #endif
    166  1.1.1.2  christos 	ldrneb	r12,[r2,#15]
    167      1.1  christos #if __ARM_ARCH__>=7 && defined(__ARMEL__)
    168      1.1  christos 	rev	r6,r6
    169      1.1  christos 	str	r6,[r0,#4]
    170      1.1  christos #elif defined(__ARMEB__)
    171      1.1  christos 	str	r6,[r0,#4]
    172      1.1  christos #else
    173      1.1  christos 	mov	r9,r6,lsr#8
    174      1.1  christos 	strb	r6,[r0,#4+3]
    175      1.1  christos 	mov	r10,r6,lsr#16
    176      1.1  christos 	strb	r9,[r0,#4+2]
    177      1.1  christos 	mov	r11,r6,lsr#24
    178      1.1  christos 	strb	r10,[r0,#4+1]
    179      1.1  christos 	strb	r11,[r0,#4]
    180      1.1  christos #endif
    181  1.1.1.2  christos 
    182      1.1  christos #if __ARM_ARCH__>=7 && defined(__ARMEL__)
    183      1.1  christos 	rev	r7,r7
    184      1.1  christos 	str	r7,[r0,#0]
    185      1.1  christos #elif defined(__ARMEB__)
    186      1.1  christos 	str	r7,[r0,#0]
    187      1.1  christos #else
    188      1.1  christos 	mov	r9,r7,lsr#8
    189      1.1  christos 	strb	r7,[r0,#0+3]
    190      1.1  christos 	mov	r10,r7,lsr#16
    191      1.1  christos 	strb	r9,[r0,#0+2]
    192      1.1  christos 	mov	r11,r7,lsr#24
    193      1.1  christos 	strb	r10,[r0,#0+1]
    194      1.1  christos 	strb	r11,[r0,#0]
    195      1.1  christos #endif
    196  1.1.1.2  christos 
    197      1.1  christos 	bne	.Louter
    198      1.1  christos 
                           @ Drop the 32-byte table copy plus the saved end pointer (4 bytes),
                           @ then restore r4-r11 and return.  Before ARMv5 the return goes
                           @ through lr: mov pc,lr when bit 0 of lr is clear, otherwise the
                           @ hand-encoded word below, which is the encoding of bx lr.
    199      1.1  christos 	add	sp,sp,#36
    200      1.1  christos #if __ARM_ARCH__>=5
    201  1.1.1.2  christos 	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
    202      1.1  christos #else
    203  1.1.1.2  christos 	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
    204      1.1  christos 	tst	lr,#1
    205      1.1  christos 	moveq	pc,lr			@ be binary compatible with V4, yet
    206  1.1.1.2  christos .word	0xe12fff1e			@ interoperable with Thumb ISA:-)
    207      1.1  christos #endif
    208      1.1  christos .size	gcm_ghash_4bit,.-gcm_ghash_4bit
    209      1.1  christos 
    210  1.1.1.2  christos .globl	gcm_gmult_4bit
                           @ gcm_gmult_4bit: multiply Xi by the hash key once, in place, using
                           @ the 4-bit table method (same arithmetic as gcm_ghash_4bit but
                           @ with no input data XORed in).
                           @ In:   r0 = Xi, 16 bytes, read and rewritten in place
                           @       r1 = Htbl, table of 16-byte entries selected by a nibble
                           @ Regs: r2 is overwritten with &rem_4bit (the table is read in
                           @       place, not copied to the stack); r3 is the byte index;
                           @       r4-r7 hold the accumulator, r8-r11 the table entry;
                           @       r12/r14 hold the low/high nibble of the current byte.
                           @       r4-r11 are saved and restored.
    211      1.1  christos .type	gcm_gmult_4bit,%function
    212      1.1  christos gcm_gmult_4bit:
    213  1.1.1.2  christos 	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
    214      1.1  christos 	ldrb	r12,[r0,#15]		@ r12 = Xi[15]
                           @ Detour through the stub next to the table: it sets r2 to
                           @ &rem_4bit and branches back to the label below.
    215      1.1  christos 	b	rem_4bit_get
    216      1.1  christos .Lrem_4bit_got:
    217      1.1  christos 	and	r14,r12,#0xf0		@ r14 = high nibble * 16 (table byte offset)
    218      1.1  christos 	and	r12,r12,#0x0f		@ r12 = low nibble
    219      1.1  christos 	mov	r3,#14			@ r3 = index of the next byte, counts down to 0
    220      1.1  christos 
                           @ Byte 15: accumulator = Htbl[nlo], then shift right by 4 bits,
                           @ add Htbl[nhi] and fold in the rem_4bit entry for the lost bits.
    221      1.1  christos 	add	r7,r1,r12,lsl#4
    222  1.1.1.2  christos 	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
    223      1.1  christos 	ldrb	r12,[r0,#14]
    224      1.1  christos 
    225      1.1  christos 	add	r11,r1,r14
    226      1.1  christos 	and	r14,r4,#0xf		@ rem
    227  1.1.1.2  christos 	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
    228      1.1  christos 	add	r14,r14,r14		@ rem*2 = byte offset of the halfword entry
    229      1.1  christos 	eor	r4,r8,r4,lsr#4
    230      1.1  christos 	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
    231      1.1  christos 	eor	r4,r4,r5,lsl#28
    232      1.1  christos 	eor	r5,r9,r5,lsr#4
    233      1.1  christos 	eor	r5,r5,r6,lsl#28
    234      1.1  christos 	eor	r6,r10,r6,lsr#4
    235      1.1  christos 	eor	r6,r6,r7,lsl#28
    236      1.1  christos 	eor	r7,r11,r7,lsr#4
    237      1.1  christos 	and	r14,r12,#0xf0
    238      1.1  christos 	eor	r7,r7,r8,lsl#16
    239      1.1  christos 	and	r12,r12,#0x0f
    240      1.1  christos 
                           @ .Loop: one iteration per remaining byte (14 down to 0), two
                           @ shift-and-add steps each.  The flags set by subs stay live until
                           @ the closing bpl and gate the fetch of the next Xi byte, so no
                           @ flag-setting instruction may be added inside the loop.
    241      1.1  christos .Loop:
    242      1.1  christos 	add	r11,r1,r12,lsl#4
    243      1.1  christos 	and	r12,r4,#0xf		@ rem
    244      1.1  christos 	subs	r3,r3,#1
    245      1.1  christos 	add	r12,r12,r12
    246  1.1.1.2  christos 	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
    247      1.1  christos 	eor	r4,r8,r4,lsr#4
    248      1.1  christos 	eor	r4,r4,r5,lsl#28
    249      1.1  christos 	eor	r5,r9,r5,lsr#4
    250      1.1  christos 	eor	r5,r5,r6,lsl#28
    251      1.1  christos 	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
    252      1.1  christos 	eor	r6,r10,r6,lsr#4
    253  1.1.1.2  christos #ifdef	__thumb2__
    254  1.1.1.2  christos 	it	pl
    255  1.1.1.2  christos #endif
    256  1.1.1.2  christos 	ldrplb	r12,[r0,r3]
    257      1.1  christos 	eor	r6,r6,r7,lsl#28
    258      1.1  christos 	eor	r7,r11,r7,lsr#4
    259      1.1  christos 
    260      1.1  christos 	add	r11,r1,r14
    261      1.1  christos 	and	r14,r4,#0xf		@ rem
    262      1.1  christos 	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
    263      1.1  christos 	add	r14,r14,r14
    264  1.1.1.2  christos 	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
    265      1.1  christos 	eor	r4,r8,r4,lsr#4
    266      1.1  christos 	eor	r4,r4,r5,lsl#28
    267      1.1  christos 	eor	r5,r9,r5,lsr#4
    268      1.1  christos 	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
    269      1.1  christos 	eor	r5,r5,r6,lsl#28
    270      1.1  christos 	eor	r6,r10,r6,lsr#4
    271      1.1  christos 	eor	r6,r6,r7,lsl#28
    272      1.1  christos 	eor	r7,r11,r7,lsr#4
    273  1.1.1.2  christos #ifdef	__thumb2__
    274  1.1.1.2  christos 	itt	pl
    275  1.1.1.2  christos #endif
    276      1.1  christos 	andpl	r14,r12,#0xf0
    277      1.1  christos 	andpl	r12,r12,#0x0f
    278      1.1  christos 	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
    279      1.1  christos 	bpl	.Loop
                           @ Write the accumulator back to Xi in big-endian byte order:
                           @ rev + str on little-endian ARMv7 and later, plain str on
                           @ big-endian, and byte-by-byte stores otherwise.
    280      1.1  christos #if __ARM_ARCH__>=7 && defined(__ARMEL__)
    281      1.1  christos 	rev	r4,r4
    282      1.1  christos 	str	r4,[r0,#12]
    283      1.1  christos #elif defined(__ARMEB__)
    284      1.1  christos 	str	r4,[r0,#12]
    285      1.1  christos #else
    286      1.1  christos 	mov	r9,r4,lsr#8
    287      1.1  christos 	strb	r4,[r0,#12+3]
    288      1.1  christos 	mov	r10,r4,lsr#16
    289      1.1  christos 	strb	r9,[r0,#12+2]
    290      1.1  christos 	mov	r11,r4,lsr#24
    291      1.1  christos 	strb	r10,[r0,#12+1]
    292      1.1  christos 	strb	r11,[r0,#12]
    293      1.1  christos #endif
    294  1.1.1.2  christos 
    295      1.1  christos #if __ARM_ARCH__>=7 && defined(__ARMEL__)
    296      1.1  christos 	rev	r5,r5
    297      1.1  christos 	str	r5,[r0,#8]
    298      1.1  christos #elif defined(__ARMEB__)
    299      1.1  christos 	str	r5,[r0,#8]
    300      1.1  christos #else
    301      1.1  christos 	mov	r9,r5,lsr#8
    302      1.1  christos 	strb	r5,[r0,#8+3]
    303      1.1  christos 	mov	r10,r5,lsr#16
    304      1.1  christos 	strb	r9,[r0,#8+2]
    305      1.1  christos 	mov	r11,r5,lsr#24
    306      1.1  christos 	strb	r10,[r0,#8+1]
    307      1.1  christos 	strb	r11,[r0,#8]
    308      1.1  christos #endif
    309  1.1.1.2  christos 
    310      1.1  christos #if __ARM_ARCH__>=7 && defined(__ARMEL__)
    311      1.1  christos 	rev	r6,r6
    312      1.1  christos 	str	r6,[r0,#4]
    313      1.1  christos #elif defined(__ARMEB__)
    314      1.1  christos 	str	r6,[r0,#4]
    315      1.1  christos #else
    316      1.1  christos 	mov	r9,r6,lsr#8
    317      1.1  christos 	strb	r6,[r0,#4+3]
    318      1.1  christos 	mov	r10,r6,lsr#16
    319      1.1  christos 	strb	r9,[r0,#4+2]
    320      1.1  christos 	mov	r11,r6,lsr#24
    321      1.1  christos 	strb	r10,[r0,#4+1]
    322      1.1  christos 	strb	r11,[r0,#4]
    323      1.1  christos #endif
    324  1.1.1.2  christos 
    325      1.1  christos #if __ARM_ARCH__>=7 && defined(__ARMEL__)
    326      1.1  christos 	rev	r7,r7
    327      1.1  christos 	str	r7,[r0,#0]
    328      1.1  christos #elif defined(__ARMEB__)
    329      1.1  christos 	str	r7,[r0,#0]
    330      1.1  christos #else
    331      1.1  christos 	mov	r9,r7,lsr#8
    332      1.1  christos 	strb	r7,[r0,#0+3]
    333      1.1  christos 	mov	r10,r7,lsr#16
    334      1.1  christos 	strb	r9,[r0,#0+2]
    335      1.1  christos 	mov	r11,r7,lsr#24
    336      1.1  christos 	strb	r10,[r0,#0+1]
    337      1.1  christos 	strb	r11,[r0,#0]
    338      1.1  christos #endif
    339  1.1.1.2  christos 
                           @ Restore r4-r11 and return.  Before ARMv5 the return goes through
                           @ lr: mov pc,lr when bit 0 of lr is clear, otherwise the
                           @ hand-encoded word below, which is the encoding of bx lr.
    340      1.1  christos #if __ARM_ARCH__>=5
    341  1.1.1.2  christos 	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
    342      1.1  christos #else
    343  1.1.1.2  christos 	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
    344      1.1  christos 	tst	lr,#1
    345      1.1  christos 	moveq	pc,lr			@ be binary compatible with V4, yet
    346  1.1.1.2  christos .word	0xe12fff1e			@ interoperable with Thumb ISA:-)
    347      1.1  christos #endif
    348      1.1  christos .size	gcm_gmult_4bit,.-gcm_gmult_4bit
    349      1.1  christos #if __ARM_MAX_ARCH__>=7
                           @ The NEON routines are assembled only when the build allows ARMv7
                           @ code; the directives below let the assembler accept NEON
                           @ instructions from here to the end of the file.
    350      1.1  christos .arch	armv7-a
    351      1.1  christos .fpu	neon
    352      1.1  christos 
    353  1.1.1.2  christos .globl	gcm_init_neon
                           @ gcm_init_neon: derive the twisted hash key used by the NEON
                           @ multiply routines.
                           @ In:  r1 = H, 16 bytes (r1 is advanced by 8 by the first load)
                           @ Out: 16 bytes stored at r0
                           @ The 128-bit value is shifted left by one bit; if the bit shifted
                           @ out of the top was set, the constant 0xc2....01 is XORed in.
    354      1.1  christos .type	gcm_init_neon,%function
    355      1.1  christos .align	4
    356      1.1  christos gcm_init_neon:
    357  1.1.1.2  christos 	vld1.64	d7,[r1]!		@ load H
    358  1.1.1.2  christos 	vmov.i8	q8,#0xe1
    359  1.1.1.2  christos 	vld1.64	d6,[r1]
    360      1.1  christos 	vshl.i64	d17,#57
    361      1.1  christos 	vshr.u64	d16,#63		@ t0=0xc2....01
    362  1.1.1.2  christos 	vdup.8	q9,d7[7]
    363      1.1  christos 	vshr.u64	d26,d6,#63		@ bit carried from the low half into the high half
    364  1.1.1.2  christos 	vshr.s8	q9,#7			@ broadcast carry bit
    365      1.1  christos 	vshl.i64	q3,q3,#1
    366  1.1.1.2  christos 	vand	q8,q8,q9		@ keep the constant only if the top bit was set
    367  1.1.1.2  christos 	vorr	d7,d26		@ H<<<=1
    368  1.1.1.2  christos 	veor	q3,q3,q8		@ twisted H
    369  1.1.1.2  christos 	vstmia	r0,{q3}
    370      1.1  christos 
    371      1.1  christos 	RET					@ bx lr
    372      1.1  christos .size	gcm_init_neon,.-gcm_init_neon
    373      1.1  christos 
    374  1.1.1.2  christos .globl	gcm_gmult_neon
                           @ gcm_gmult_neon: multiply Xi by the twisted hash key once, in
                           @ place.  This is only a front end: it sets up the operands and
                           @ branches into the shared multiply/reduce code at .Lgmult_neon
                           @ inside gcm_ghash_neon, which also writes the result and returns.
                           @ In:  r0 = Xi, 16 bytes; r1 = twisted H as stored by gcm_init_neon
                           @ r0 is advanced by 16 by the two loads; the shared tail subtracts
                           @ 16 again before storing.
    375      1.1  christos .type	gcm_gmult_neon,%function
    376      1.1  christos .align	4
    377      1.1  christos gcm_gmult_neon:
    378  1.1.1.2  christos 	vld1.64	d7,[r0]!		@ load Xi
    379  1.1.1.2  christos 	vld1.64	d6,[r0]!
    380      1.1  christos 	vmov.i64	d29,#0x0000ffffffffffff
    381  1.1.1.2  christos 	vldmia	r1,{d26,d27}	@ load twisted H
    382      1.1  christos 	vmov.i64	d30,#0x00000000ffffffff
    383      1.1  christos #ifdef __ARMEL__
    384      1.1  christos 	vrev64.8	q3,q3
    385      1.1  christos #endif
    386      1.1  christos 	vmov.i64	d31,#0x000000000000ffff
    387  1.1.1.2  christos 	veor	d28,d26,d27		@ Karatsuba pre-processing
                           @ r3 = 16 makes the shared loop counter reach zero after a single
                           @ pass, so the tail falls through to the store and return.
    388  1.1.1.2  christos 	mov	r3,#16
    389  1.1.1.2  christos 	b	.Lgmult_neon
    390      1.1  christos .size	gcm_gmult_neon,.-gcm_gmult_neon
    391      1.1  christos 
    392  1.1.1.2  christos .globl	gcm_ghash_neon
                           @ gcm_ghash_neon: for every 16-byte block of input, XOR the block
                           @ into Xi and multiply by the twisted hash key, then store Xi.
                           @ In:   r0 = Xi, 16 bytes, read and rewritten in place
                           @       r1 = twisted H as stored by gcm_init_neon
                           @       r2 = inp, r3 = len in bytes
                           @ Regs: q0 = running Xi, q3 = multiplicand for the current pass,
                           @       d26/d27 = the two halves of H, d28 = d26 ^ d27,
                           @       d29-d31 = byte masks, q8-q11 = temporaries,
                           @       q0/q1/q2 = the three partial products.
                           @ The counter is tested only after a pass (subs then bne).
                           @ NOTE(review): callers presumably guarantee that len is a nonzero
                           @ multiple of 16 - confirm against the C caller.
                           @ gcm_gmult_neon enters this function at .Lgmult_neon.
    393      1.1  christos .type	gcm_ghash_neon,%function
    394      1.1  christos .align	4
    395      1.1  christos gcm_ghash_neon:
    396  1.1.1.2  christos 	vld1.64	d1,[r0]!		@ load Xi
    397  1.1.1.2  christos 	vld1.64	d0,[r0]!
    398      1.1  christos 	vmov.i64	d29,#0x0000ffffffffffff
    399  1.1.1.2  christos 	vldmia	r1,{d26,d27}	@ load twisted H
    400      1.1  christos 	vmov.i64	d30,#0x00000000ffffffff
    401      1.1  christos #ifdef __ARMEL__
    402      1.1  christos 	vrev64.8	q0,q0
    403      1.1  christos #endif
    404      1.1  christos 	vmov.i64	d31,#0x000000000000ffff
    405  1.1.1.2  christos 	veor	d28,d26,d27		@ Karatsuba pre-processing
    406      1.1  christos 
    407      1.1  christos .Loop_neon:
    408  1.1.1.2  christos 	vld1.64	d7,[r2]!		@ load inp
    409  1.1.1.2  christos 	vld1.64	d6,[r2]!
    410      1.1  christos #ifdef __ARMEL__
    411      1.1  christos 	vrev64.8	q3,q3
    412      1.1  christos #endif
    413  1.1.1.2  christos 	veor	q3,q0			@ inp^=Xi
                           @ Shared multiply: q3 times H.  Each of the three 64x64-bit
                           @ carry-less products below is assembled from 8x8-bit vmull.p8
                           @ products of byte-rotated operands (A1..A3, B1..B4); the partial
                           @ sums are masked with d29-d31, rotated into position with vext
                           @ and XORed into the direct product D.
                           @ Product 1: q0 = d26 * d6.
    414      1.1  christos .Lgmult_neon:
    415  1.1.1.2  christos 	vext.8	d16, d26, d26, #1	@ A1
    416      1.1  christos 	vmull.p8	q8, d16, d6		@ F = A1*B
    417  1.1.1.2  christos 	vext.8	d0, d6, d6, #1	@ B1
    418      1.1  christos 	vmull.p8	q0, d26, d0		@ E = A*B1
    419  1.1.1.2  christos 	vext.8	d18, d26, d26, #2	@ A2
    420      1.1  christos 	vmull.p8	q9, d18, d6		@ H = A2*B
    421  1.1.1.2  christos 	vext.8	d22, d6, d6, #2	@ B2
    422      1.1  christos 	vmull.p8	q11, d26, d22		@ G = A*B2
    423  1.1.1.2  christos 	vext.8	d20, d26, d26, #3	@ A3
    424  1.1.1.2  christos 	veor	q8, q8, q0		@ L = E + F
    425      1.1  christos 	vmull.p8	q10, d20, d6		@ J = A3*B
    426  1.1.1.2  christos 	vext.8	d0, d6, d6, #3	@ B3
    427  1.1.1.2  christos 	veor	q9, q9, q11		@ M = G + H
    428      1.1  christos 	vmull.p8	q0, d26, d0		@ I = A*B3
    429  1.1.1.2  christos 	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
    430  1.1.1.2  christos 	vand	d17, d17, d29
    431  1.1.1.2  christos 	vext.8	d22, d6, d6, #4	@ B4
    432  1.1.1.2  christos 	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
    433  1.1.1.2  christos 	vand	d19, d19, d30
    434      1.1  christos 	vmull.p8	q11, d26, d22		@ K = A*B4
    435  1.1.1.2  christos 	veor	q10, q10, q0		@ N = I + J
    436  1.1.1.2  christos 	veor	d16, d16, d17
    437  1.1.1.2  christos 	veor	d18, d18, d19
    438  1.1.1.2  christos 	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
    439  1.1.1.2  christos 	vand	d21, d21, d31
    440  1.1.1.2  christos 	vext.8	q8, q8, q8, #15
    441  1.1.1.2  christos 	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
    442      1.1  christos 	vmov.i64	d23, #0
    443  1.1.1.2  christos 	vext.8	q9, q9, q9, #14
    444  1.1.1.2  christos 	veor	d20, d20, d21
    445      1.1  christos 	vmull.p8	q0, d26, d6		@ D = A*B
    446  1.1.1.2  christos 	vext.8	q11, q11, q11, #12
    447  1.1.1.2  christos 	vext.8	q10, q10, q10, #13
    448  1.1.1.2  christos 	veor	q8, q8, q9
    449  1.1.1.2  christos 	veor	q10, q10, q11
    450  1.1.1.2  christos 	veor	q0, q0, q8
    451  1.1.1.2  christos 	veor	q0, q0, q10
                           @ Product 2 (Karatsuba middle term): q1 = d28 * (d6 ^ d7).
                           @ d6 is overwritten here; d7 is still needed for product 3.
    452  1.1.1.2  christos 	veor	d6,d6,d7	@ Karatsuba pre-processing
    453  1.1.1.2  christos 	vext.8	d16, d28, d28, #1	@ A1
    454      1.1  christos 	vmull.p8	q8, d16, d6		@ F = A1*B
    455  1.1.1.2  christos 	vext.8	d2, d6, d6, #1	@ B1
    456      1.1  christos 	vmull.p8	q1, d28, d2		@ E = A*B1
    457  1.1.1.2  christos 	vext.8	d18, d28, d28, #2	@ A2
    458      1.1  christos 	vmull.p8	q9, d18, d6		@ H = A2*B
    459  1.1.1.2  christos 	vext.8	d22, d6, d6, #2	@ B2
    460      1.1  christos 	vmull.p8	q11, d28, d22		@ G = A*B2
    461  1.1.1.2  christos 	vext.8	d20, d28, d28, #3	@ A3
    462  1.1.1.2  christos 	veor	q8, q8, q1		@ L = E + F
    463      1.1  christos 	vmull.p8	q10, d20, d6		@ J = A3*B
    464  1.1.1.2  christos 	vext.8	d2, d6, d6, #3	@ B3
    465  1.1.1.2  christos 	veor	q9, q9, q11		@ M = G + H
    466      1.1  christos 	vmull.p8	q1, d28, d2		@ I = A*B3
    467  1.1.1.2  christos 	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
    468  1.1.1.2  christos 	vand	d17, d17, d29
    469  1.1.1.2  christos 	vext.8	d22, d6, d6, #4	@ B4
    470  1.1.1.2  christos 	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
    471  1.1.1.2  christos 	vand	d19, d19, d30
    472      1.1  christos 	vmull.p8	q11, d28, d22		@ K = A*B4
    473  1.1.1.2  christos 	veor	q10, q10, q1		@ N = I + J
    474  1.1.1.2  christos 	veor	d16, d16, d17
    475  1.1.1.2  christos 	veor	d18, d18, d19
    476  1.1.1.2  christos 	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
    477  1.1.1.2  christos 	vand	d21, d21, d31
    478  1.1.1.2  christos 	vext.8	q8, q8, q8, #15
    479  1.1.1.2  christos 	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
    480      1.1  christos 	vmov.i64	d23, #0
    481  1.1.1.2  christos 	vext.8	q9, q9, q9, #14
    482  1.1.1.2  christos 	veor	d20, d20, d21
    483      1.1  christos 	vmull.p8	q1, d28, d6		@ D = A*B
    484  1.1.1.2  christos 	vext.8	q11, q11, q11, #12
    485  1.1.1.2  christos 	vext.8	q10, q10, q10, #13
    486  1.1.1.2  christos 	veor	q8, q8, q9
    487  1.1.1.2  christos 	veor	q10, q10, q11
    488  1.1.1.2  christos 	veor	q1, q1, q8
    489  1.1.1.2  christos 	veor	q1, q1, q10
                           @ Product 3: q2 = d27 * d7.
    490  1.1.1.2  christos 	vext.8	d16, d27, d27, #1	@ A1
    491      1.1  christos 	vmull.p8	q8, d16, d7		@ F = A1*B
    492  1.1.1.2  christos 	vext.8	d4, d7, d7, #1	@ B1
    493      1.1  christos 	vmull.p8	q2, d27, d4		@ E = A*B1
    494  1.1.1.2  christos 	vext.8	d18, d27, d27, #2	@ A2
    495      1.1  christos 	vmull.p8	q9, d18, d7		@ H = A2*B
    496  1.1.1.2  christos 	vext.8	d22, d7, d7, #2	@ B2
    497      1.1  christos 	vmull.p8	q11, d27, d22		@ G = A*B2
    498  1.1.1.2  christos 	vext.8	d20, d27, d27, #3	@ A3
    499  1.1.1.2  christos 	veor	q8, q8, q2		@ L = E + F
    500      1.1  christos 	vmull.p8	q10, d20, d7		@ J = A3*B
    501  1.1.1.2  christos 	vext.8	d4, d7, d7, #3	@ B3
    502  1.1.1.2  christos 	veor	q9, q9, q11		@ M = G + H
    503      1.1  christos 	vmull.p8	q2, d27, d4		@ I = A*B3
    504  1.1.1.2  christos 	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
    505  1.1.1.2  christos 	vand	d17, d17, d29
    506  1.1.1.2  christos 	vext.8	d22, d7, d7, #4	@ B4
    507  1.1.1.2  christos 	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
    508  1.1.1.2  christos 	vand	d19, d19, d30
    509      1.1  christos 	vmull.p8	q11, d27, d22		@ K = A*B4
    510  1.1.1.2  christos 	veor	q10, q10, q2		@ N = I + J
    511  1.1.1.2  christos 	veor	d16, d16, d17
    512  1.1.1.2  christos 	veor	d18, d18, d19
    513  1.1.1.2  christos 	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
    514  1.1.1.2  christos 	vand	d21, d21, d31
    515  1.1.1.2  christos 	vext.8	q8, q8, q8, #15
    516  1.1.1.2  christos 	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
    517      1.1  christos 	vmov.i64	d23, #0
    518  1.1.1.2  christos 	vext.8	q9, q9, q9, #14
    519  1.1.1.2  christos 	veor	d20, d20, d21
    520      1.1  christos 	vmull.p8	q2, d27, d7		@ D = A*B
    521  1.1.1.2  christos 	vext.8	q11, q11, q11, #12
    522  1.1.1.2  christos 	vext.8	q10, q10, q10, #13
    523  1.1.1.2  christos 	veor	q8, q8, q9
    524  1.1.1.2  christos 	veor	q10, q10, q11
    525  1.1.1.2  christos 	veor	q2, q2, q8
    526  1.1.1.2  christos 	veor	q2, q2, q10
                           @ Combine: the middle term is q1 ^ q0 ^ q2; its two halves are
                           @ XORed into the upper half of q0 (d1) and the lower half of q2
                           @ (d4), leaving the 256-bit product in q2:q0.
    527  1.1.1.2  christos 	veor	q1,q1,q0		@ Karatsuba post-processing
    528  1.1.1.2  christos 	veor	q1,q1,q2
    529  1.1.1.2  christos 	veor	d1,d1,d2
    530  1.1.1.2  christos 	veor	d4,d4,d3	@ Xh|Xl - 256-bit result
    531      1.1  christos 
                           @ Reduce the 256-bit product in q2:q0 to 128 bits in q0.
                           @ 1st phase: (q0<<57) ^ (q0<<62) ^ (q0<<63), computed per 64-bit
                           @ lane, is XORed into d1 and d4.
                           @ 2nd phase: q0 = q2 ^ q0 ^ (q0>>1) ^ (q0>>2) ^ (q0>>7), again
                           @ with per-lane shifts.
    532      1.1  christos 	@ equivalent of reduction_avx from ghash-x86_64.pl
    533      1.1  christos 	vshl.i64	q9,q0,#57		@ 1st phase
    534      1.1  christos 	vshl.i64	q10,q0,#62
    535  1.1.1.2  christos 	veor	q10,q10,q9		@
    536      1.1  christos 	vshl.i64	q9,q0,#63
    537  1.1.1.2  christos 	veor	q10, q10, q9		@
    538  1.1.1.2  christos 	veor	d1,d1,d20	@
    539  1.1.1.2  christos 	veor	d4,d4,d21
    540      1.1  christos 
    541      1.1  christos 	vshr.u64	q10,q0,#1		@ 2nd phase
    542  1.1.1.2  christos 	veor	q2,q2,q0
    543  1.1.1.2  christos 	veor	q0,q0,q10		@
    544      1.1  christos 	vshr.u64	q10,q10,#6
    545      1.1  christos 	vshr.u64	q0,q0,#1		@
    546  1.1.1.2  christos 	veor	q0,q0,q2		@
    547  1.1.1.2  christos 	veor	q0,q0,q10		@
    548      1.1  christos 
                           @ 16 bytes consumed; loop while input remains.  When entered from
                           @ gcm_gmult_neon r3 was preset to 16, so this falls through.
    549  1.1.1.2  christos 	subs	r3,#16
    550  1.1.1.2  christos 	bne	.Loop_neon
    551      1.1  christos 
    552      1.1  christos #ifdef __ARMEL__
    553      1.1  christos 	vrev64.8	q0,q0
    554      1.1  christos #endif
                           @ r0 was advanced by 16 when Xi was loaded; step back before
                           @ storing the result in the same half order it was loaded in.
    555  1.1.1.2  christos 	sub	r0,#16
    556  1.1.1.2  christos 	vst1.64	d1,[r0]!		@ write out Xi
    557  1.1.1.2  christos 	vst1.64	d0,[r0]
    558      1.1  christos 
    559      1.1  christos 	RET					@ bx lr
    560      1.1  christos .size	gcm_ghash_neon,.-gcm_ghash_neon
    561      1.1  christos #endif
    562  1.1.1.2  christos .byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
                           @ The bytes above are a NUL-terminated ASCII identification string
                           @ that starts with: GHASH for ARMv4/NEON, CRYPTOGAMS by
                           @ (followed by the author address in angle brackets).
                           @ Re-align to a 4-byte boundary after the odd-length string; the
                           @ second directive is redundant but harmless.
    563  1.1.1.2  christos .align	2
    564  1.1.1.2  christos .align	2
    565