Home | History | Annotate | Line # | Download | only in v6
      1      1.1  mrg dnl  ARM v6 mpn_sqr_basecase.
      2      1.1  mrg 
      3  1.1.1.2  mrg dnl  Contributed to the GNU project by Torbjörn Granlund.
      4      1.1  mrg 
      5  1.1.1.2  mrg dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
      6      1.1  mrg 
      7      1.1  mrg dnl  This file is part of the GNU MP Library.
      8  1.1.1.2  mrg dnl
      9      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10  1.1.1.2  mrg dnl  it under the terms of either:
     11  1.1.1.2  mrg dnl
     12  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     13  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     14  1.1.1.2  mrg dnl      option) any later version.
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl  or
     17  1.1.1.2  mrg dnl
     18  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     19  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     20  1.1.1.2  mrg dnl      later version.
     21  1.1.1.2  mrg dnl
     22  1.1.1.2  mrg dnl  or both in parallel, as here.
     23  1.1.1.2  mrg dnl
     24      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27  1.1.1.2  mrg dnl  for more details.
     28  1.1.1.2  mrg dnl
     29  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     30  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     32      1.1  mrg 
     33      1.1  mrg include(`../config.m4')
     34      1.1  mrg 
     35      1.1  mrg C Code structure:
     36      1.1  mrg C
     37      1.1  mrg C
     38      1.1  mrg C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
     39      1.1  mrg C           |               |               |               |
     40      1.1  mrg C           |               |               |               |
     41      1.1  mrg C           |               |               |               |
     42      1.1  mrg C          \|/             \|/             \|/             \|/
     43      1.1  mrg C              ____________                   ____________
     44      1.1  mrg C             /            \                 /            \
     45      1.1  mrg C            \|/            \               \|/            \
     46      1.1  mrg C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
     47      1.1  mrg C            \            /|\                \            /|\
     48      1.1  mrg C             \____________/                  \____________/
     49      1.1  mrg C                       \                        /
     50      1.1  mrg C                        \                      /
     51      1.1  mrg C                         \                    /
     52  1.1.1.2  mrg C                         cor3             cor2
     53      1.1  mrg C                            \              /
     54      1.1  mrg C                             \            /
     55      1.1  mrg C                            sqr_diag_addlsh1
     56      1.1  mrg 
     57      1.1  mrg C TODO
     58  1.1.1.2  mrg C  * Align more labels.
     59      1.1  mrg C  * Further tweak counter and updates in outer loops.  (This could save
     60      1.1  mrg C    perhaps 5n cycles).
     61  1.1.1.2  mrg C  * Avoid sub-with-lsl in outer loops.  We could keep n up-shifted, then
     62  1.1.1.2  mrg C    initialise loop counter i with a right shift.
     63      1.1  mrg C  * Try to use fewer registers.  Perhaps coalesce r10 branch target and n_saved.
     64      1.1  mrg C    (This could save 2-3 cycles for n > 4.)
     65  1.1.1.2  mrg C  * Optimise sqr_diag_addlsh1 loop.  The current code uses old-style carry
     66  1.1.1.2  mrg C    propagation.
     67  1.1.1.2  mrg C  * Stop loops earlier suppressing writes of upper-most rp[] values.
     68  1.1.1.2  mrg C  * The addmul_2 loops here run well on all cores, but mul_2 runs poorly
     69  1.1.1.2  mrg C    particularly on Cortex-A8.
     70  1.1.1.2  mrg 
     71      1.1  mrg 
     72      1.1  mrg define(`rp',      r0)		C result pointer (all stores go through it)
     73      1.1  mrg define(`up',      r1)		C source operand pointer
     74      1.1  mrg define(`n',       r2)		C limb count, decremented as passes complete
     75      1.1  mrg 
     76      1.1  mrg define(`v0',      r3)		C first multiplier limb of the current 2-limb pass
     77      1.1  mrg define(`v1',      r6)		C second multiplier limb of the current 2-limb pass
     78      1.1  mrg define(`i',       r8)		C inner loop counter
     79      1.1  mrg define(`n_saved', r14)		C original n, read again by the final diagonal pass
     80      1.1  mrg define(`cya',     r11)		C carry limb of the umaal chain multiplying by v0
     81      1.1  mrg define(`cyb',     r12)		C carry limb of the umaal chain multiplying by v1
     82      1.1  mrg define(`u0',      r7)		C source limb, used alternately with u1
     83      1.1  mrg define(`u1',      r9)		C source limb, used alternately with u0
     84      1.1  mrg 
     85      1.1  mrg ASM_START()
     86      1.1  mrg PROLOGUE(mpn_sqr_basecase)
     87      1.1  mrg 	and	r12, n, #3		C dispatch on n: r12 = n mod 4
     88      1.1  mrg 	cmp	n, #4
     89      1.1  mrg 	addgt	r12, r12, #4		C n > 4: select the second half of the table
     90      1.1  mrg 	add	pc, pc, r12, lsl #2	C computed jump, 4 bytes per table entry
     91      1.1  mrg 	nop				C pad: pc reads as this add + 8, so index 0 is the entry below
     92      1.1  mrg 	b	L(4)			C index 0: n = 4
     93      1.1  mrg 	b	L(1)			C index 1: n = 1
     94      1.1  mrg 	b	L(2)			C index 2: n = 2
     95      1.1  mrg 	b	L(3)			C index 3: n = 3
     96      1.1  mrg 	b	L(0m4)			C index 4: n > 4, n mod 4 = 0
     97      1.1  mrg 	b	L(1m4)			C index 5: n > 4, n mod 4 = 1
     98      1.1  mrg 	b	L(2m4)			C index 6: n > 4, n mod 4 = 2
     99      1.1  mrg 	b	L(3m4)			C index 7: n > 4, n mod 4 = 3
    100      1.1  mrg 
    101      1.1  mrg 
    102  1.1.1.2  mrg L(1m4):	push	{r4-r11, r14}		C mul_2 set-up for n > 4, n mod 4 = 1; r14 is saved as it becomes n_saved
    103      1.1  mrg 	mov	n_saved, n		C keep the original n for L(sqr_diag_addlsh1)
    104      1.1  mrg 	sub	i, n, #4		C inner loop counter
    105      1.1  mrg 	sub	n, n, #2
    106      1.1  mrg 	add	r10, pc, #L(am2_2m4)-.-8	C r10 = address of L(am2_2m4), taken by bx r10 after the mul_2 loop
    107      1.1  mrg 	ldm	up, {v0,v1,u0}		C v0 = up[0], v1 = up[1], u0 = up[2]
    108      1.1  mrg 	sub	up, up, #4		C bias up so that [up, #16]! at L(ko0) reads up[3]
    109      1.1  mrg 	mov	cyb, #0
    110      1.1  mrg 	mov	r5, #0
    111      1.1  mrg 	umull	r4, cya, v1, v0		C cya:r4 = up[1] * up[0]
    112      1.1  mrg 	str	r4, [rp], #-12		C store the low limb at rp[0], then bias rp for the stores at L(ko0)
    113      1.1  mrg 	mov	r4, #0
    114      1.1  mrg 	b	L(ko0)			C enter the unrolled loop at its fourth quarter
    115      1.1  mrg 
    116  1.1.1.2  mrg L(3m4):	push	{r4-r11, r14}		C mul_2 set-up for n > 4, n mod 4 = 3
    117      1.1  mrg 	mov	n_saved, n		C keep the original n for L(sqr_diag_addlsh1)
    118      1.1  mrg 	sub	i, n, #4		C inner loop counter
    119      1.1  mrg 	sub	n, n, #2
    120      1.1  mrg 	add	r10, pc, #L(am2_0m4)-.-8	C r10 = address of L(am2_0m4), taken by bx r10 after the mul_2 loop
    121      1.1  mrg 	ldm	up, {v0,v1,u0}		C v0 = up[0], v1 = up[1], u0 = up[2]
    122      1.1  mrg 	add	up, up, #4		C bias up so that [up, #8] at L(ko2) reads up[3]
    123      1.1  mrg 	mov	cyb, #0
    124      1.1  mrg 	mov	r5, #0
    125      1.1  mrg 	umull	r4, cya, v1, v0		C cya:r4 = up[1] * up[0]
    126      1.1  mrg 	str	r4, [rp], #-4		C store the low limb at rp[0], then bias rp for the stores at L(ko2)
    127      1.1  mrg 	mov	r4, #0
    128      1.1  mrg 	b	L(ko2)			C enter the unrolled loop at its second quarter
    129      1.1  mrg 
    130  1.1.1.2  mrg L(2m4):	push	{r4-r11, r14}		C mul_2 set-up for n > 4, n mod 4 = 2
    131      1.1  mrg 	mov	n_saved, n		C keep the original n for L(sqr_diag_addlsh1)
    132      1.1  mrg 	sub	i, n, #4		C inner loop counter
    133      1.1  mrg 	sub	n, n, #2
    134      1.1  mrg 	add	r10, pc, #L(am2_3m4)-.-8	C r10 = address of L(am2_3m4), taken by bx r10 after the mul_2 loop
    135      1.1  mrg 	ldm	up, {v0,v1,u1}		C v0 = up[0], v1 = up[1], u1 = up[2]
    136      1.1  mrg 	mov	cyb, #0
    137      1.1  mrg 	mov	r4, #0
    138      1.1  mrg 	umull	r5, cya, v1, v0		C cya:r5 = up[1] * up[0]
    139      1.1  mrg 	str	r5, [rp], #-8		C store the low limb at rp[0], then bias rp for the stores at L(ko1)
    140      1.1  mrg 	mov	r5, #0
    141      1.1  mrg 	b	L(ko1)			C enter the unrolled loop at its third quarter
    142      1.1  mrg 
    143  1.1.1.2  mrg L(0m4):	push	{r4-r11, r14}		C mul_2 set-up for n > 4, n mod 4 = 0
    144      1.1  mrg 	mov	n_saved, n		C keep the original n for L(sqr_diag_addlsh1)
    145      1.1  mrg 	sub	i, n, #4		C inner loop counter
    146      1.1  mrg 	sub	n, n, #2
    147      1.1  mrg 	add	r10, pc, #L(am2_1m4)-.-8	C r10 = address of L(am2_1m4), taken by bx r10 after the mul_2 loop
    148      1.1  mrg 	ldm	up, {v0,v1,u1}		C v0 = up[0], v1 = up[1], u1 = up[2]
    149      1.1  mrg 	mov	cyb, #0
    150      1.1  mrg 	mov	r4, #0
    151      1.1  mrg 	add	up, up, #8		C bias up so that [up, #4] at L(top) reads up[3]
    152      1.1  mrg 	umull	r5, cya, v1, v0		C cya:r5 = up[1] * up[0]
    153      1.1  mrg 	str	r5, [rp, #0]		C store the low limb at rp[0]; no rp bias needed, falls into L(top)
    154      1.1  mrg 	mov	r5, #0
    155      1.1  mrg 
    156      1.1  mrg L(top):	ldr	u0, [up, #4]		C mul_2 loop, 4 limbs per iteration; L(ko2), L(ko1), L(ko0) are entry points
    157      1.1  mrg 	umaal	r4, cya, u1, v0		C cya:r4 = u1 * v0 + r4 + cya
    158      1.1  mrg 	str	r4, [rp, #4]
    159      1.1  mrg 	mov	r4, #0			C nothing to accumulate from memory: start the next limb at zero
    160      1.1  mrg 	umaal	r5, cyb, u1, v1		C cyb:r5 = u1 * v1 + r5 + cyb
    161      1.1  mrg L(ko2):	ldr	u1, [up, #8]
    162      1.1  mrg 	umaal	r5, cya, u0, v0
    163      1.1  mrg 	str	r5, [rp, #8]
    164      1.1  mrg 	mov	r5, #0
    165      1.1  mrg 	umaal	r4, cyb, u0, v1
    166      1.1  mrg L(ko1):	ldr	u0, [up, #12]
    167      1.1  mrg 	umaal	r4, cya, u1, v0
    168      1.1  mrg 	str	r4, [rp, #12]
    169      1.1  mrg 	mov	r4, #0
    170      1.1  mrg 	umaal	r5, cyb, u1, v1
    171      1.1  mrg L(ko0):	ldr	u1, [up, #16]!		C pre-indexed with writeback: up advances 4 limbs
    172      1.1  mrg 	umaal	r5, cya, u0, v0
    173      1.1  mrg 	str	r5, [rp, #16]!		C pre-indexed with writeback: rp advances 4 limbs
    174      1.1  mrg 	mov	r5, #0
    175      1.1  mrg 	umaal	r4, cyb, u0, v1
    176      1.1  mrg 	subs	i, i, #4
    177      1.1  mrg 	bhi	L(top)			C continue while the subtraction neither borrows nor yields zero
    178  1.1.1.2  mrg 
    179  1.1.1.2  mrg 	umaal	r4, cya, u1, v0		C wind-down: finish both carry chains
    180  1.1.1.2  mrg 	ldr	u0, [up, #4]
    181  1.1.1.2  mrg 	umaal	r5, cyb, u1, v1
    182  1.1.1.2  mrg 	str	r4, [rp, #4]
    183  1.1.1.2  mrg 	umaal	r5, cya, u0, v0
    184  1.1.1.2  mrg 	umaal	cya, cyb, u0, v1	C cyb:cya = u0 * v1 + cya + cyb, merging the two chains
    185  1.1.1.2  mrg 	str	r5, [rp, #8]
    186  1.1.1.2  mrg 	str	cya, [rp, #12]		C top two limbs of this pass; cya and cyb stay live in registers
    187  1.1.1.2  mrg 	str	cyb, [rp, #16]
    188  1.1.1.2  mrg 
    189  1.1.1.2  mrg 	add	up, up, #4
    190  1.1.1.2  mrg 	sub	n, n, #1
    191  1.1.1.2  mrg 	add	rp, rp, #8
    192      1.1  mrg 	bx	r10			C continue at the addmul_2 entry chosen by the set-up code
    193      1.1  mrg 
    194      1.1  mrg L(evnloop):				C addmul_2 passes alternating between the L(ua2) and L(ua0) loops
    195  1.1.1.2  mrg 	subs	i, n, #6		C i = n - 6; these flags are tested by the blt below
    196      1.1  mrg 	sub	n, n, #2		C no s suffix: flags from the subs are preserved
    197  1.1.1.2  mrg 	blt	L(cor2)			C n < 6: finish with the L(cor2) corner code
    198  1.1.1.2  mrg 	ldm	up, {v0,v1,u1}		C v0, v1 = multiplier limbs of this pass, u1 = first multiplicand limb
    199  1.1.1.2  mrg 	add	up, up, #8
    200      1.1  mrg 	mov	cya, #0
    201      1.1  mrg 	mov	cyb, #0
    202  1.1.1.2  mrg 	ldr	r4, [rp, #-4]
    203      1.1  mrg 	umaal	r4, cya, v1, v0		C cya:r4 = v1 * v0 + r4 (cya is 0 here)
    204  1.1.1.2  mrg 	str	r4, [rp, #-4]
    205  1.1.1.2  mrg 	ldr	r4, [rp, #0]
    206  1.1.1.2  mrg 
    207  1.1.1.2  mrg 	ALIGN(16)
    208  1.1.1.2  mrg L(ua2):	ldr	r5, [rp, #4]		C addmul_2 inner loop, 4 limbs per iteration; r4/r5 alternate as the rp limb being updated
    209      1.1  mrg 	umaal	r4, cya, u1, v0		C cya:r4 = u1 * v0 + r4 + cya
    210  1.1.1.2  mrg 	ldr	u0, [up, #4]
    211      1.1  mrg 	umaal	r5, cyb, u1, v1		C cyb:r5 = u1 * v1 + r5 + cyb
    212  1.1.1.2  mrg 	str	r4, [rp, #0]
    213  1.1.1.2  mrg 	ldr	r4, [rp, #8]
    214      1.1  mrg 	umaal	r5, cya, u0, v0
    215  1.1.1.2  mrg 	ldr	u1, [up, #8]
    216      1.1  mrg 	umaal	r4, cyb, u0, v1
    217  1.1.1.2  mrg 	str	r5, [rp, #4]
    218  1.1.1.2  mrg 	ldr	r5, [rp, #12]
    219      1.1  mrg 	umaal	r4, cya, u1, v0
    220  1.1.1.2  mrg 	ldr	u0, [up, #12]
    221      1.1  mrg 	umaal	r5, cyb, u1, v1
    222  1.1.1.2  mrg 	str	r4, [rp, #8]
    223  1.1.1.2  mrg 	ldr	r4, [rp, #16]!		C pre-indexed with writeback: rp advances 4 limbs
    224      1.1  mrg 	umaal	r5, cya, u0, v0
    225  1.1.1.2  mrg 	ldr	u1, [up, #16]!		C pre-indexed with writeback: up advances 4 limbs
    226      1.1  mrg 	umaal	r4, cyb, u0, v1
    227  1.1.1.2  mrg 	str	r5, [rp, #-4]		C rp was already advanced, hence the negative offset
    228      1.1  mrg 	subs	i, i, #4
    229  1.1.1.2  mrg 	bhs	L(ua2)			C continue while the subtraction did not borrow
    230  1.1.1.2  mrg 
    231      1.1  mrg 	umaal	r4, cya, u1, v0		C wind-down: finish both carry chains
    232  1.1.1.2  mrg 	umaal	cya, cyb, u1, v1	C cyb:cya = u1 * v1 + cya + cyb
    233  1.1.1.2  mrg 	str	r4, [rp, #0]
    234  1.1.1.2  mrg 	str	cya, [rp, #4]		C top two limbs of this pass; cya and cyb stay live in registers
    235  1.1.1.2  mrg 	str	cyb, [rp, #8]
    236  1.1.1.2  mrg L(am2_0m4):				C also entered via bx r10 from the mul_2 stage (r10 set in L(3m4))
    237      1.1  mrg 	sub	rp, rp, n, lsl #2	C rewind rp by n limbs
    238  1.1.1.2  mrg 	sub	up, up, n, lsl #2	C rewind up by n limbs
    239  1.1.1.2  mrg 	add	rp, rp, #8
    240  1.1.1.2  mrg 
    241      1.1  mrg 	sub	i, n, #4
    242      1.1  mrg 	sub	n, n, #2
    243  1.1.1.2  mrg 	ldm	up, {v0,v1,u1}		C v0, v1 = multiplier limbs of this pass, u1 = first multiplicand limb
    244      1.1  mrg 	mov	cya, #0
    245      1.1  mrg 	mov	cyb, #0
    246  1.1.1.2  mrg 	ldr	r4, [rp, #4]
    247      1.1  mrg 	umaal	r4, cya, v1, v0		C cya:r4 = v1 * v0 + r4 (cya is 0 here)
    248  1.1.1.2  mrg 	str	r4, [rp, #4]
    249  1.1.1.2  mrg 	ldr	r4, [rp, #8]
    250      1.1  mrg 	b	L(lo0)			C enter the L(ua0) loop at its midpoint
    251  1.1.1.2  mrg 
    252  1.1.1.2  mrg 	ALIGN(16)
    253  1.1.1.2  mrg L(ua0):	ldr	r5, [rp, #4]		C addmul_2 inner loop, same body as L(ua2) with a mid-loop entry at L(lo0)
    254      1.1  mrg 	umaal	r4, cya, u1, v0
    255  1.1.1.2  mrg 	ldr	u0, [up, #4]
    256      1.1  mrg 	umaal	r5, cyb, u1, v1
    257  1.1.1.2  mrg 	str	r4, [rp, #0]
    258  1.1.1.2  mrg 	ldr	r4, [rp, #8]
    259      1.1  mrg 	umaal	r5, cya, u0, v0
    260  1.1.1.2  mrg 	ldr	u1, [up, #8]
    261      1.1  mrg 	umaal	r4, cyb, u0, v1
    262  1.1.1.2  mrg 	str	r5, [rp, #4]
    263  1.1.1.2  mrg L(lo0):	ldr	r5, [rp, #12]
    264      1.1  mrg 	umaal	r4, cya, u1, v0
    265  1.1.1.2  mrg 	ldr	u0, [up, #12]
    266      1.1  mrg 	umaal	r5, cyb, u1, v1
    267  1.1.1.2  mrg 	str	r4, [rp, #8]
    268  1.1.1.2  mrg 	ldr	r4, [rp, #16]!
    269      1.1  mrg 	umaal	r5, cya, u0, v0
    270  1.1.1.2  mrg 	ldr	u1, [up, #16]!
    271      1.1  mrg 	umaal	r4, cyb, u0, v1
    272  1.1.1.2  mrg 	str	r5, [rp, #-4]
    273      1.1  mrg 	subs	i, i, #4
    274  1.1.1.2  mrg 	bhs	L(ua0)			C continue while the subtraction did not borrow
    275  1.1.1.2  mrg 
    276      1.1  mrg 	umaal	r4, cya, u1, v0		C wind-down: finish both carry chains
    277  1.1.1.2  mrg 	umaal	cya, cyb, u1, v1	C cyb:cya = u1 * v1 + cya + cyb
    278  1.1.1.2  mrg 	str	r4, [rp, #0]
    279  1.1.1.2  mrg 	str	cya, [rp, #4]
    280  1.1.1.2  mrg 	str	cyb, [rp, #8]
    281  1.1.1.2  mrg L(am2_2m4):				C also entered via bx r10 from the mul_2 stage (r10 set in L(1m4))
    282      1.1  mrg 	sub	rp, rp, n, lsl #2	C rewind rp by n limbs
    283  1.1.1.2  mrg 	sub	up, up, n, lsl #2	C rewind up by n limbs
    284  1.1.1.2  mrg 	add	rp, rp, #16
    285      1.1  mrg 	b	L(evnloop)
    286      1.1  mrg 
    287      1.1  mrg 
    288      1.1  mrg L(oddloop):				C addmul_2 passes alternating between the L(ua1) and L(ua3) loops
    289  1.1.1.2  mrg 	sub	i, n, #5		C inner loop counter
    290      1.1  mrg 	sub	n, n, #2
    291  1.1.1.2  mrg 	ldm	up, {v0,v1,u0}		C v0, v1 = multiplier limbs of this pass, u0 = first multiplicand limb
    292      1.1  mrg 	mov	cya, #0
    293      1.1  mrg 	mov	cyb, #0
    294  1.1.1.2  mrg 	ldr	r5, [rp, #0]
    295      1.1  mrg 	umaal	r5, cya, v1, v0		C cya:r5 = v1 * v0 + r5 (cya is 0 here)
    296  1.1.1.2  mrg 	str	r5, [rp, #0]
    297  1.1.1.2  mrg 	ldr	r5, [rp, #4]
    298  1.1.1.2  mrg 	add	up, up, #4
    299      1.1  mrg 	b	L(lo1)			C enter the L(ua1) loop after its first quarter
    300  1.1.1.2  mrg 
    301  1.1.1.2  mrg 	ALIGN(16)
    302  1.1.1.2  mrg L(ua1):	ldr	r5, [rp, #4]		C addmul_2 inner loop, 4 limbs per iteration; r4/r5 alternate as the rp limb being updated
    303      1.1  mrg 	umaal	r4, cya, u1, v0		C cya:r4 = u1 * v0 + r4 + cya
    304  1.1.1.2  mrg 	ldr	u0, [up, #4]
    305      1.1  mrg 	umaal	r5, cyb, u1, v1		C cyb:r5 = u1 * v1 + r5 + cyb
    306  1.1.1.2  mrg 	str	r4, [rp, #0]
    307  1.1.1.2  mrg L(lo1):	ldr	r4, [rp, #8]
    308      1.1  mrg 	umaal	r5, cya, u0, v0
    309  1.1.1.2  mrg 	ldr	u1, [up, #8]
    310      1.1  mrg 	umaal	r4, cyb, u0, v1
    311  1.1.1.2  mrg 	str	r5, [rp, #4]
    312  1.1.1.2  mrg 	ldr	r5, [rp, #12]
    313      1.1  mrg 	umaal	r4, cya, u1, v0
    314  1.1.1.2  mrg 	ldr	u0, [up, #12]
    315      1.1  mrg 	umaal	r5, cyb, u1, v1
    316  1.1.1.2  mrg 	str	r4, [rp, #8]
    317  1.1.1.2  mrg 	ldr	r4, [rp, #16]!		C pre-indexed with writeback: rp advances 4 limbs
    318      1.1  mrg 	umaal	r5, cya, u0, v0
    319  1.1.1.2  mrg 	ldr	u1, [up, #16]!		C pre-indexed with writeback: up advances 4 limbs
    320      1.1  mrg 	umaal	r4, cyb, u0, v1
    321  1.1.1.2  mrg 	str	r5, [rp, #-4]		C rp was already advanced, hence the negative offset
    322      1.1  mrg 	subs	i, i, #4
    323  1.1.1.2  mrg 	bhs	L(ua1)			C continue while the subtraction did not borrow
    324  1.1.1.2  mrg 
    325      1.1  mrg 	umaal	r4, cya, u1, v0		C wind-down: finish both carry chains
    326  1.1.1.2  mrg 	umaal	cya, cyb, u1, v1	C cyb:cya = u1 * v1 + cya + cyb
    327  1.1.1.2  mrg 	str	r4, [rp, #0]
    328  1.1.1.2  mrg 	str	cya, [rp, #4]
    329  1.1.1.2  mrg 	str	cyb, [rp, #8]
    330  1.1.1.2  mrg L(am2_3m4):				C also entered via bx r10 from the mul_2 stage (r10 set in L(2m4))
    331      1.1  mrg 	sub	rp, rp, n, lsl #2	C rewind rp by n limbs
    332  1.1.1.2  mrg 	sub	up, up, n, lsl #2	C rewind up by n limbs
    333  1.1.1.2  mrg 	add	rp, rp, #4
    334  1.1.1.2  mrg 
    335  1.1.1.2  mrg 	subs	i, n, #3
    336  1.1.1.2  mrg 	beq	L(cor3)			C n = 3: finish with the L(cor3) corner code
    337      1.1  mrg 	sub	n, n, #2
    338  1.1.1.2  mrg 	ldm	up, {v0,v1,u0}		C v0, v1 = multiplier limbs of this pass, u0 = first multiplicand limb
    339      1.1  mrg 	mov	cya, #0
    340      1.1  mrg 	mov	cyb, #0
    341      1.1  mrg 	ldr	r5, [rp, #8]
    342  1.1.1.2  mrg 	sub	up, up, #4		C bias up so that [up, #16]! at L(lo3) reads the limb after u0
    343  1.1.1.2  mrg 	umaal	r5, cya, v1, v0		C cya:r5 = v1 * v0 + r5 (cya is 0 here)
    344  1.1.1.2  mrg 	str	r5, [rp, #8]
    345  1.1.1.2  mrg 	ldr	r5, [rp, #12]
    346  1.1.1.2  mrg 	b	L(lo3)			C enter the L(ua3) loop at its last quarter
    347  1.1.1.2  mrg 
    348  1.1.1.2  mrg 	ALIGN(16)
    349  1.1.1.2  mrg L(ua3):	ldr	r5, [rp, #4]		C addmul_2 inner loop, same body as L(ua1) with a late entry at L(lo3)
    350      1.1  mrg 	umaal	r4, cya, u1, v0
    351  1.1.1.2  mrg 	ldr	u0, [up, #4]
    352      1.1  mrg 	umaal	r5, cyb, u1, v1
    353  1.1.1.2  mrg 	str	r4, [rp, #0]
    354  1.1.1.2  mrg 	ldr	r4, [rp, #8]
    355      1.1  mrg 	umaal	r5, cya, u0, v0
    356  1.1.1.2  mrg 	ldr	u1, [up, #8]
    357      1.1  mrg 	umaal	r4, cyb, u0, v1
    358  1.1.1.2  mrg 	str	r5, [rp, #4]
    359  1.1.1.2  mrg 	ldr	r5, [rp, #12]
    360      1.1  mrg 	umaal	r4, cya, u1, v0
    361  1.1.1.2  mrg 	ldr	u0, [up, #12]
    362      1.1  mrg 	umaal	r5, cyb, u1, v1
    363  1.1.1.2  mrg 	str	r4, [rp, #8]
    364  1.1.1.2  mrg L(lo3):	ldr	r4, [rp, #16]!
    365      1.1  mrg 	umaal	r5, cya, u0, v0
    366  1.1.1.2  mrg 	ldr	u1, [up, #16]!
    367      1.1  mrg 	umaal	r4, cyb, u0, v1
    368  1.1.1.2  mrg 	str	r5, [rp, #-4]
    369      1.1  mrg 	subs	i, i, #4
    370  1.1.1.2  mrg 	bhs	L(ua3)			C continue while the subtraction did not borrow
    371  1.1.1.2  mrg 
    372      1.1  mrg 	umaal	r4, cya, u1, v0		C wind-down: finish both carry chains
    373  1.1.1.2  mrg 	umaal	cya, cyb, u1, v1	C cyb:cya = u1 * v1 + cya + cyb
    374  1.1.1.2  mrg 	str	r4, [rp, #0]
    375  1.1.1.2  mrg 	str	cya, [rp, #4]
    376  1.1.1.2  mrg 	str	cyb, [rp, #8]
    377  1.1.1.2  mrg L(am2_1m4):				C also entered via bx r10 from the mul_2 stage (r10 set in L(0m4))
    378      1.1  mrg 	sub	rp, rp, n, lsl #2	C rewind rp by n limbs
    379  1.1.1.2  mrg 	sub	up, up, n, lsl #2	C rewind up by n limbs
    380  1.1.1.2  mrg 	add	rp, rp, #12
    381      1.1  mrg 	b	L(oddloop)
    382      1.1  mrg 
    383  1.1.1.2  mrg 
    384  1.1.1.2  mrg L(cor3):ldm	up, {v0,v1,u0}		C corner code reached with n = 3: v0 = up[0], v1 = up[1], u0 = up[2]
    385  1.1.1.2  mrg 	ldr	r5, [rp, #8]
    386  1.1.1.2  mrg 	mov	cya, #0
    387      1.1  mrg 	mov	cyb, #0
    388  1.1.1.2  mrg 	umaal	r5, cya, v1, v0		C cya:r5 = v1 * v0 + r5
    389  1.1.1.2  mrg 	str	r5, [rp, #8]
    390  1.1.1.2  mrg 	ldr	r5, [rp, #12]
    391  1.1.1.2  mrg 	ldr	r4, [rp, #16]
    392  1.1.1.2  mrg 	umaal	r5, cya, u0, v0		C cya:r5 = u0 * v0 + r5 + cya
    393  1.1.1.2  mrg 	ldr	u1, [up, #12]		C u1 = up[3]
    394  1.1.1.2  mrg 	umaal	r4, cyb, u0, v1		C cyb:r4 = u0 * v1 + r4
    395  1.1.1.2  mrg 	str	r5, [rp, #12]
    396  1.1.1.2  mrg 	umaal	r4, cya, u1, v0		C cya:r4 = u1 * v0 + r4 + cya
    397  1.1.1.2  mrg 	umaal	cya, cyb, u1, v1	C cyb:cya = u1 * v1 + cya + cyb
    398  1.1.1.2  mrg 	str	r4, [rp, #16]
    399  1.1.1.2  mrg 	str	cya, [rp, #20]
    400  1.1.1.2  mrg 	str	cyb, [rp, #24]
    401  1.1.1.2  mrg 	add	up, up, #16
    402  1.1.1.2  mrg 	mov	cya, cyb		C seed the last product with the limb just stored at [rp, #24]
    403  1.1.1.2  mrg 	adds	rp, rp, #36		C clear cy
    404  1.1.1.2  mrg 	mov	cyb, #0
    405  1.1.1.2  mrg 	umaal	cya, cyb, u1, u0	C cyb:cya = u1 * u0 + cya; L(sqr_diag_addlsh1) stores both limbs
    406      1.1  mrg 	b	L(sqr_diag_addlsh1)
    407      1.1  mrg 
    408  1.1.1.2  mrg L(cor2):				C corner code reached from L(evnloop) when n < 6; falls through to L(sqr_diag_addlsh1)
    409  1.1.1.2  mrg 	ldm	up!, {v0,v1,u0}		C load three limbs; writeback advances up by 12 bytes
    410  1.1.1.2  mrg 	mov	r4, cya			C r4, r5 = cya, cyb as left in registers by the preceding pass
    411  1.1.1.2  mrg 	mov	r5, cyb
    412      1.1  mrg 	mov	cya, #0
    413      1.1  mrg 	umaal	r4, cya, v1, v0		C cya:r4 = v1 * v0 + r4
    414  1.1.1.2  mrg 	mov	cyb, #0
    415      1.1  mrg 	umaal	r5, cya, u0, v0		C cya:r5 = u0 * v0 + r5 + cya
    416  1.1.1.2  mrg 	strd	r4, r5, [rp, #-4]	C store r4 at [rp, #-4] and r5 at [rp, #0]
    417      1.1  mrg 	umaal	cya, cyb, u0, v1	C cyb:cya = u0 * v1 + cya; L(sqr_diag_addlsh1) stores both limbs
    418  1.1.1.2  mrg 	add	rp, rp, #16
    419      1.1  mrg C	b	L(sqr_diag_addlsh1)
    420      1.1  mrg 
    421      1.1  mrg 
    422      1.1  mrg define(`w0',  r6)		C product limb being doubled; reuses the register of v1
    423      1.1  mrg define(`w1',  r7)		C product limb being doubled; reuses the register of u0
    424      1.1  mrg define(`w2',  r8)		C saved carry out of the doubling; reuses the register of i
    425      1.1  mrg define(`rbx', r9)		C high limb of the previous square plus w2; reuses the register of u1
    426      1.1  mrg 
    427      1.1  mrg L(sqr_diag_addlsh1):			C final pass: double the product limbs held in rp[] and add the squares of the up limbs
    428      1.1  mrg 	str	cya, [rp, #-12]		C store the last two product limbs left in cya, cyb by the corner code
    429      1.1  mrg 	str	cyb, [rp, #-8]
    430      1.1  mrg 	sub	n, n_saved, #1		C loop count = original n - 1
    431      1.1  mrg 	sub	up, up, n_saved, lsl #2	C rewind up by n_saved limbs
    432      1.1  mrg 	sub	rp, rp, n_saved, lsl #3	C rewind rp by 2 * n_saved limbs
    433      1.1  mrg 	ldr	r3, [up], #4		C post-indexed: load one limb, then advance up
    434      1.1  mrg 	umull	w1, r5, r3, r3		C r5:w1 = square of that limb
    435      1.1  mrg 	mov	w2, #0
    436  1.1.1.2  mrg 	mov	r10, #0			C constant zero for the adc that captures the carry below
    437  1.1.1.2  mrg C	cmn	r0, #0			C clear cy (already clear)
    438      1.1  mrg 	b	L(lm)
    439      1.1  mrg 
    440      1.1  mrg L(tsd):	adds	w0, w0, rbx		C add the high limb of the previous square plus the saved doubling carry
    441      1.1  mrg 	adcs	w1, w1, r4		C add the low limb of the current square; cy feeds the next doubling
    442      1.1  mrg 	str	w0, [rp, #0]
    443      1.1  mrg L(lm):	ldr	w0, [rp, #4]		C w0 = next product limb
    444      1.1  mrg 	str	w1, [rp, #4]		C overwrite that slot with the finished limb in w1
    445      1.1  mrg 	ldr	w1, [rp, #8]!		C w1 = following product limb; writeback advances rp 2 limbs
    446      1.1  mrg 	add	rbx, r5, w2		C plain add: leaves cy untouched
    447      1.1  mrg 	adcs	w0, w0, w0		C w0 = 2 * w0 + cy
    448      1.1  mrg 	ldr	r3, [up], #4		C next source limb
    449      1.1  mrg 	adcs	w1, w1, w1		C w1 = 2 * w1 + cy
    450  1.1.1.2  mrg 	adc	w2, r10, r10		C w2 = carry out of the doubling (r10 is 0)
    451      1.1  mrg 	umull	r4, r5, r3, r3		C r5:r4 = square of the limb just loaded
    452      1.1  mrg 	subs	n, n, #1
    453      1.1  mrg 	bne	L(tsd)
    454      1.1  mrg 
    455      1.1  mrg 	adds	w0, w0, rbx		C last round: same additions as at L(tsd)
    456      1.1  mrg 	adcs	w1, w1, r4
    457      1.1  mrg 	adc	w2, r5, w2		C top limb = high limb of the last square + doubling carry + cy
    458      1.1  mrg 	stm	rp, {w0,w1,w2}		C store the three most significant result limbs
    459      1.1  mrg 
    460  1.1.1.2  mrg 	pop	{r4-r11, pc}		C restore the registers pushed at entry; the saved r14 is popped into pc to return
    461      1.1  mrg 
    462      1.1  mrg 
    463      1.1  mrg C Straight line code for n <= 4
    464      1.1  mrg 
    465      1.1  mrg L(1):	ldr	r3, [up, #0]		C n = 1: r3 = up[0]
    466      1.1  mrg 	umull	r1, r2, r3, r3		C r2:r1 = up[0]^2; r1, r2 (up, n) are no longer needed
    467      1.1  mrg 	stm	rp, {r1,r2}		C rp[0] = low limb, rp[1] = high limb
    468      1.1  mrg 	bx	r14			C return
    469      1.1  mrg 
    470      1.1  mrg L(2):	push	{r4-r5}			C n = 2: r4, r5 are used as scratch below
    471      1.1  mrg 	ldm	up, {r5,r12}		C r5 = up[0], r12 = up[1]
    472      1.1  mrg 	umull	r1, r2, r5, r5		C r2:r1 = up[0]^2
    473      1.1  mrg 	umull	r3, r4, r12, r12	C r4:r3 = up[1]^2
    474      1.1  mrg 	umull	r5, r12, r5, r12	C r12:r5 = up[0] * up[1]
    475      1.1  mrg 	adds	r5, r5, r5		C double the cross product
    476      1.1  mrg 	adcs	r12, r12, r12
    477      1.1  mrg 	adc	r4, r4, #0		C carry out of the doubling goes into the top limb
    478      1.1  mrg 	adds	r2, r2, r5		C add the doubled cross product at limb position 1
    479      1.1  mrg 	adcs	r3, r3, r12
    480      1.1  mrg 	adc	r4, r4, #0
    481      1.1  mrg 	stm	rp, {r1,r2,r3,r4}	C store the 4-limb result
    482      1.1  mrg 	pop	{r4-r5}
    483      1.1  mrg 	bx	r14			C return
    484      1.1  mrg 
    485      1.1  mrg L(3):	push	{r4-r11}		C n = 3: straight-line code using r1-r12
    486      1.1  mrg 	ldm	up, {r7,r8,r9}		C r7 = up[0], r8 = up[1], r9 = up[2]
    487      1.1  mrg 	umull	r1, r2, r7, r7		C r2:r1 = up[0]^2
    488      1.1  mrg 	umull	r3, r4, r8, r8		C r4:r3 = up[1]^2
    489      1.1  mrg 	umull	r5, r6, r9, r9		C r6:r5 = up[2]^2
    490      1.1  mrg 	umull	r10, r11, r7, r8	C r11:r10 = up[0] * up[1]
    491      1.1  mrg 	mov	r12, #0
    492      1.1  mrg 	umlal	r11, r12, r7, r9	C r12:r11 += up[0] * up[2]
    493      1.1  mrg 	mov	r7, #0			C up[0] is no longer needed; r7 becomes the top cross-product limb
    494      1.1  mrg 	umlal	r12, r7, r8, r9		C r7:r12 += up[1] * up[2]
    495      1.1  mrg 	adds	r10, r10, r10		C double the 4-limb cross-product sum r7:r12:r11:r10
    496      1.1  mrg 	adcs	r11, r11, r11
    497      1.1  mrg 	adcs	r12, r12, r12
    498      1.1  mrg 	adcs	r7, r7, r7
    499      1.1  mrg 	adc	r6, r6, #0		C carry out of the doubling goes into the top limb
    500      1.1  mrg 	adds	r2, r2, r10		C add the doubled cross products starting at limb position 1
    501      1.1  mrg 	adcs	r3, r3, r11
    502      1.1  mrg 	adcs	r4, r4, r12
    503      1.1  mrg 	adcs	r5, r5, r7
    504      1.1  mrg 	adc	r6, r6, #0
    505      1.1  mrg 	stm	rp, {r1,r2,r3,r4,r5,r6}	C store the 6-limb result
    506      1.1  mrg 	pop	{r4-r11}
    507      1.1  mrg 	bx	r14			C return
    508      1.1  mrg 
    509      1.1  mrg L(4):	push	{r4-r11, r14}		C n = 4: r14 is saved because it is used as scratch below
    510      1.1  mrg 	ldm	up, {r9,r10,r11,r12}	C r9 = up[0], r10 = up[1], r11 = up[2], r12 = up[3]
    511      1.1  mrg 	umull	r1, r2, r9, r9		C r2:r1 = up[0]^2
    512      1.1  mrg 	umull	r3, r4, r10, r10	C r4:r3 = up[1]^2
    513      1.1  mrg 	umull	r5, r6, r11, r11	C r6:r5 = up[2]^2
    514      1.1  mrg 	umull	r7, r8, r12, r12	C r8:r7 = up[3]^2
    515      1.1  mrg 	stm	rp, {r1,r2,r3,r4,r5,r6,r7}	C park the low 7 limbs of the squares in rp[0..6]; the top limb stays in r8
    516      1.1  mrg 	umull	r1, r2, r9, r10		C r2:r1 = up[0] * up[1]
    517      1.1  mrg 	mov	r3, #0
    518      1.1  mrg 	umlal	r2, r3, r9, r11		C r3:r2 += up[0] * up[2]
    519      1.1  mrg 	mov	r4, #0
    520      1.1  mrg 	umlal	r3, r4, r9, r12		C r4:r3 += up[0] * up[3]
    521      1.1  mrg 	mov	r5, #0
    522      1.1  mrg 	umlal	r3, r5, r10, r11	C r5:r3 += up[1] * up[2]
    523      1.1  mrg 	umaal	r4, r5, r10, r12	C r5:r4 = up[1] * up[3] + r4 + r5
    524      1.1  mrg 	mov	r6, #0
    525      1.1  mrg 	umlal	r5, r6, r11, r12	C r6:r5 += up[2] * up[3]
    526      1.1  mrg 	adds	r1, r1, r1		C double the 6-limb cross-product sum r6..r1
    527      1.1  mrg 	adcs	r2, r2, r2
    528      1.1  mrg 	adcs	r3, r3, r3
    529      1.1  mrg 	adcs	r4, r4, r4
    530      1.1  mrg 	adcs	r5, r5, r5
    531      1.1  mrg 	adcs	r6, r6, r6
    532      1.1  mrg 	add	rp, rp, #4		C plain add: cy from the doubling is preserved
    533  1.1.1.2  mrg 	adc	r7, r8, #0		C r7 = top limb of the squares + carry out of the doubling
    534      1.1  mrg 	ldm	rp, {r8,r9,r10,r11,r12,r14}	C reload limbs 1..6 of the squares parked above
    535      1.1  mrg 	adds	r1, r1, r8		C add them to the doubled cross products
    536      1.1  mrg 	adcs	r2, r2, r9
    537      1.1  mrg 	adcs	r3, r3, r10
    538      1.1  mrg 	adcs	r4, r4, r11
    539      1.1  mrg 	adcs	r5, r5, r12
    540      1.1  mrg 	adcs	r6, r6, r14
    541      1.1  mrg 	adc	r7, r7, #0
    542      1.1  mrg 	stm	rp, {r1,r2,r3,r4,r5,r6,r7}	C store result limbs 1..7 (rp was advanced by one limb)
    543      1.1  mrg 	pop	{r4-r11, pc}		C restore registers; the saved r14 is popped into pc to return
    544      1.1  mrg EPILOGUE()
    545