Home | History | Annotate | Line # | Download | only in v9
      1      1.1  mrg dnl  SPARC v9 32-bit mpn_sqr_diagonal.
      2      1.1  mrg 
      3      1.1  mrg dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
      4      1.1  mrg 
      5      1.1  mrg dnl  This file is part of the GNU MP Library.
      6  1.1.1.2  mrg dnl
      7      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      8  1.1.1.2  mrg dnl  it under the terms of either:
      9  1.1.1.2  mrg dnl
     10  1.1.1.2  mrg dnl    * the GNU Lesser General Public License as published by the Free
     11  1.1.1.2  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     12  1.1.1.2  mrg dnl      option) any later version.
     13  1.1.1.2  mrg dnl
     14  1.1.1.2  mrg dnl  or
     15  1.1.1.2  mrg dnl
     16  1.1.1.2  mrg dnl    * the GNU General Public License as published by the Free Software
     17  1.1.1.2  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     18  1.1.1.2  mrg dnl      later version.
     19  1.1.1.2  mrg dnl
     20  1.1.1.2  mrg dnl  or both in parallel, as here.
     21  1.1.1.2  mrg dnl
     22      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     23      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     24  1.1.1.2  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     25  1.1.1.2  mrg dnl  for more details.
     26  1.1.1.2  mrg dnl
     27  1.1.1.2  mrg dnl  You should have received copies of the GNU General Public License and the
     28  1.1.1.2  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     29  1.1.1.2  mrg dnl  see https://www.gnu.org/licenses/.
     30      1.1  mrg 
     31      1.1  mrg 
     32      1.1  mrg include(`../config.m4')
     33      1.1  mrg 
     34      1.1  mrg C INPUT PARAMETERS
     35      1.1  mrg C rp	i0
     36      1.1  mrg C up	i1
     37      1.1  mrg C n	i2
     38      1.1  mrg 
     39      1.1  mrg C This code uses a very deep software pipeline, due to the need for moving data
     40      1.1  mrg C back and forth between the integer registers and floating-point registers.
     41      1.1  mrg C
     42      1.1  mrg C A VIS variant of this code would make the pipeline less deep, since the
     43      1.1  mrg C masking now done in the integer unit could take place in the floating-point
     44      1.1  mrg C unit using the FAND instruction.  It would be possible to save several cycles
     45      1.1  mrg C too.
     46      1.1  mrg C
     47      1.1  mrg C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
     48      1.1  mrg C not much slower from the Ecache.  It would perhaps be possible to shave off
     49      1.1  mrg C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
     50      1.1  mrg C instructions used, since we have 10 memory operations per limb.  But a VIS
     51      1.1  mrg C variant could run three cycles faster than the corresponding non-VIS code.
     52      1.1  mrg 
     53      1.1  mrg C This is non-pipelined code showing the algorithm:
     54      1.1  mrg C
     55      1.1  mrg C .Loop:
     56      1.1  mrg C	lduw	[up+0],%g4		C 00000000hhhhllll
     57      1.1  mrg C	sllx	%g4,16,%g3		C 0000hhhhllll0000
     58      1.1  mrg C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
     59      1.1  mrg C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
     60      1.1  mrg C	stx	%g2,[%fp+80]
     61      1.1  mrg C	ldd	[%fp+80],%f0
     62      1.1  mrg C	fitod	%f0,%f4			C hi16
     63      1.1  mrg C	fitod	%f1,%f6			C lo16
     64      1.1  mrg C	ld	[up+0],%f9
     65      1.1  mrg C	fxtod	%f8,%f2
     66      1.1  mrg C	fmuld	%f2,%f4,%f4
     67      1.1  mrg C	fmuld	%f2,%f6,%f6
     68      1.1  mrg C	fdtox	%f4,%f4
     69      1.1  mrg C	fdtox	%f6,%f6
     70      1.1  mrg C	std	%f4,[%fp-24]
     71      1.1  mrg C	std	%f6,[%fp-16]
     72      1.1  mrg C	ldx	[%fp-24],%g2
     73      1.1  mrg C	ldx	[%fp-16],%g1
     74      1.1  mrg C	sllx	%g2,16,%g2
     75      1.1  mrg C	add	%g2,%g1,%g1
     76      1.1  mrg C	stw	%g1,[rp+0]
     77      1.1  mrg C	srlx	%g1,32,%l0
     78      1.1  mrg C	stw	%l0,[rp+4]
     79      1.1  mrg C	add	up,4,up
     80      1.1  mrg C	subcc	n,1,n
     81      1.1  mrg C	bne,pt	%icc,.Loop
     82      1.1  mrg C	add	rp,8,rp
     83      1.1  mrg 
     84      1.1  mrg define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe; fills the FP slot of a loop group.  %f10 and %f12 are not referenced anywhere else in this file.
     85      1.1  mrg 
     86      1.1  mrg ASM_START()
     87      1.1  mrg 
     88      1.1  mrg 	TEXT
     89      1.1  mrg 	ALIGN(4)
                      C A single zero word.  mpn_sqr_diagonal loads it into %f8 once at entry; the
                      C PIC variant of that load addresses it as an offset from %pc (.Lnoll-.Lpc).
     90      1.1  mrg .Lnoll:
     91      1.1  mrg 	.word	0
     92      1.1  mrg 
     93      1.1  mrg PROLOGUE(mpn_sqr_diagonal)
                      C For each of the n 32-bit limbs u read from up, store the 64-bit square u*u
                      C at rp as two 32-bit words, low word first (rp advances 8 bytes per limb).
                      C
                      C Method (see the non-pipelined listing above): u is split into 16-bit
                      C halves, the two products u*hi16 and u*lo16 are formed with fmuld, converted
                      C back with fdtox, and recombined in the integer unit as
                      C (u*hi16 << 16) + u*lo16.  Each product is at most 48 bits wide.
                      C
                      C Stack slots used to move data between the register files:
                      C   [%fp+72], [%fp+80]	masked limb 0000hhhh0000llll (integer -> fp)
                      C   [%fp-24], [%fp-16]	p16, p0 product pair	     (fp -> integer)
                      C   [%fp-40], [%fp-32]	p16, p0 product pair	     (fp -> integer)
                      C Consecutive limbs alternate between the two slots of each kind.
                      C
                      C NOTE(review): [%fp+72] and [%fp+80] lie above %fp, i.e. outside the 256
                      C bytes allocated by the save below; presumably this is the argument area
                      C the 32-bit ABI reserves in the caller frame -- confirm.
                      C NOTE(review): n = 0 is not special-cased; the code appears to assume
                      C n >= 1 -- confirm against callers.
     94      1.1  mrg 	save	%sp,-256,%sp
     95      1.1  mrg 
                      C Load the zero word at .Lnoll into %f8.  Limbs are loaded into %f9 below, so
                      C the pair %f8:%f9 read by fxtod is the limb zero-extended to 64 bits.
     96      1.1  mrg ifdef(`PIC',
     97      1.1  mrg `.Lpc:	rd	%pc,%o7
     98      1.1  mrg 	ld	[%o7+.Lnoll-.Lpc],%f8',
     99      1.1  mrg `	sethi	%hi(.Lnoll),%g1
    100      1.1  mrg 	ld	[%g1+%lo(.Lnoll)],%f8')
    101      1.1  mrg 
                      C %g5 = 0xffff0000, the bits cleared by every andn below.
                      C up is biased by -8: lduw [%i1+8] fetches the limb to be split, while
                      C ld [%i1] fetches a limb for fxtod (two limbs behind in the main loop).
    102      1.1  mrg 	sethi	%hi(0xffff0000),%g5
    103      1.1  mrg 	add	%i1,-8,%i1
    104      1.1  mrg 
                      C Feed-in.  Split limb 0.  Here and below, the andn sits in the delay slot
                      C of a non-annulled branch and is therefore executed on both paths.
    105      1.1  mrg 	lduw	[%i1+8],%g4
    106      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    107      1.1  mrg 	sllx	%g4,16,%g3		C 0000hhhhllll0000
    108      1.1  mrg 	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    109      1.1  mrg 	subcc	%i2,1,%i2
    110      1.1  mrg 	bne,pt	%icc,.L_grt_1
    111      1.1  mrg 	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    112      1.1  mrg 
                      C n = 1: run the single limb through every stage up to the std of its two
                      C products, set %l3-%l6 to the slot addresses and join the wind-down at .L1
                      C (which only reads %l4).
    113      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    114      1.1  mrg 	stx	%g2,[%fp+80]
    115      1.1  mrg 	ld	[%i1],%f9
    116      1.1  mrg 	ldd	[%fp+80],%f0
    117      1.1  mrg 	fxtod	%f8,%f2
    118      1.1  mrg 	fitod	%f0,%f4
    119      1.1  mrg 	fitod	%f1,%f6
    120      1.1  mrg 	fmuld	%f2,%f4,%f4
    121      1.1  mrg 	fmuld	%f2,%f6,%f6
    122      1.1  mrg 	fdtox	%f4,%f4
    123      1.1  mrg 	fdtox	%f6,%f6
    124      1.1  mrg 	std	%f4,[%fp-24]
    125      1.1  mrg 	std	%f6,[%fp-16]
    126      1.1  mrg 
    127      1.1  mrg 	add	%fp, 80, %l3
    128      1.1  mrg 	add	%fp, -24, %l4
    129      1.1  mrg 	add	%fp, 72, %l5
    130      1.1  mrg 	b	.L1
    131      1.1  mrg 	add	%fp, -40, %l6
    132      1.1  mrg 
                      C n >= 2: stage masked limb 0 in [%fp+80] and split limb 1.
    133      1.1  mrg .L_grt_1:
    134      1.1  mrg 	stx	%g2,[%fp+80]
    135      1.1  mrg 	lduw	[%i1+8],%g4
    136      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    137      1.1  mrg 	sllx	%g4,16,%g3		C 0000hhhhllll0000
    138      1.1  mrg 	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    139      1.1  mrg 	subcc	%i2,1,%i2
    140      1.1  mrg 	bne,pt	%icc,.L_grt_2
    141      1.1  mrg 	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    142      1.1  mrg 
                      C n = 2: store the products of limb 0 in [%fp-24]/[%fp-16], take limb 1 as
                      C far as its first fdtox and join the wind-down at .L2.  %l4 selects the
                      C slot pair that .L2 fills with limb 1; %l6 the pair holding limb 0.
    143      1.1  mrg 	stx	%g2,[%fp+72]
    144      1.1  mrg 	ld	[%i1],%f9
    145      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    146      1.1  mrg 	ldd	[%fp+80],%f0
    147      1.1  mrg 	fxtod	%f8,%f2
    148      1.1  mrg 	fitod	%f0,%f4
    149      1.1  mrg 	fitod	%f1,%f6
    150      1.1  mrg 	fmuld	%f2,%f4,%f4
    151      1.1  mrg 	ld	[%i1],%f9
    152      1.1  mrg 	fmuld	%f2,%f6,%f6
    153      1.1  mrg 	ldd	[%fp+72],%f0
    154      1.1  mrg 	fdtox	%f4,%f4
    155      1.1  mrg 	fdtox	%f6,%f6
    156      1.1  mrg 	std	%f4,[%fp-24]
    157      1.1  mrg 	fxtod	%f8,%f2
    158      1.1  mrg 	std	%f6,[%fp-16]
    159      1.1  mrg 	fitod	%f0,%f4
    160      1.1  mrg 	fitod	%f1,%f6
    161      1.1  mrg 	fmuld	%f2,%f4,%f4
    162      1.1  mrg 	fmuld	%f2,%f6,%f6
    163      1.1  mrg 	fdtox	%f4,%f4
    164      1.1  mrg 
    165      1.1  mrg 	add	%fp, 72, %l3
    166      1.1  mrg 	add	%fp, -40, %l4
    167      1.1  mrg 	add	%fp, 80, %l5
    168      1.1  mrg 	b	.L2
    169      1.1  mrg 	add	%fp, -24, %l6
    170      1.1  mrg 
                      C n >= 3: stage masked limb 1 in [%fp+72], split limb 2 and start the
                      C floating-point work on limb 0.
    171      1.1  mrg .L_grt_2:
    172      1.1  mrg 	stx	%g2,[%fp+72]
    173      1.1  mrg 	lduw	[%i1+8],%g4
    174      1.1  mrg 	ld	[%i1],%f9
    175      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    176      1.1  mrg 	ldd	[%fp+80],%f0
    177      1.1  mrg 	sllx	%g4,16,%g3		C 0000hhhhllll0000
    178      1.1  mrg 	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    179      1.1  mrg 	subcc	%i2,1,%i2
    180      1.1  mrg 	fxtod	%f8,%f2
    181      1.1  mrg 	bne,pt	%icc,.L_grt_3
    182      1.1  mrg 	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    183      1.1  mrg 
                      C n = 3: advance limbs 0..2 far enough to join the wind-down at .L3.
    184      1.1  mrg 	stx	%g2,[%fp+80]
    185      1.1  mrg 	fitod	%f0,%f4
    186      1.1  mrg 	fitod	%f1,%f6
    187      1.1  mrg 	fmuld	%f2,%f4,%f4
    188      1.1  mrg 	ld	[%i1],%f9
    189      1.1  mrg 	fmuld	%f2,%f6,%f6
    190      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    191      1.1  mrg 	ldd	[%fp+72],%f0
    192      1.1  mrg 	fdtox	%f4,%f4
    193      1.1  mrg 	fdtox	%f6,%f6
    194      1.1  mrg 	std	%f4,[%fp-24]
    195      1.1  mrg 	fxtod	%f8,%f2
    196      1.1  mrg 	std	%f6,[%fp-16]
    197      1.1  mrg 	fitod	%f0,%f4
    198      1.1  mrg 	fitod	%f1,%f6
    199      1.1  mrg 	fmuld	%f2,%f4,%f4
    200      1.1  mrg 	ld	[%i1],%f9
    201      1.1  mrg 	add	%fp, 80, %l3
    202      1.1  mrg 	fmuld	%f2,%f6,%f6
    203      1.1  mrg 	add	%fp, -24, %l4
    204      1.1  mrg 	ldd	[%fp+80],%f0
    205      1.1  mrg 	add	%fp, 72, %l5
    206      1.1  mrg 	fdtox	%f4,%f4
    207      1.1  mrg 	b	.L3
    208      1.1  mrg 	add	%fp, -40, %l6
    209      1.1  mrg 
                      C n >= 4: stage masked limb 2 in [%fp+80], split limb 3 and store the
                      C products of limb 0 in [%fp-24]/[%fp-16].
    210      1.1  mrg .L_grt_3:
    211      1.1  mrg 	stx	%g2,[%fp+80]
    212      1.1  mrg 	fitod	%f0,%f4
    213      1.1  mrg 	lduw	[%i1+8],%g4
    214      1.1  mrg 	fitod	%f1,%f6
    215      1.1  mrg 	fmuld	%f2,%f4,%f4
    216      1.1  mrg 	ld	[%i1],%f9
    217      1.1  mrg 	fmuld	%f2,%f6,%f6
    218      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    219      1.1  mrg 	ldd	[%fp+72],%f0
    220      1.1  mrg 	fdtox	%f4,%f4
    221      1.1  mrg 	sllx	%g4,16,%g3		C 0000hhhhllll0000
    222      1.1  mrg 	fdtox	%f6,%f6
    223      1.1  mrg 	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    224      1.1  mrg 	subcc	%i2,1,%i2
    225      1.1  mrg 	std	%f4,[%fp-24]
    226      1.1  mrg 	fxtod	%f8,%f2
    227      1.1  mrg 	std	%f6,[%fp-16]
    228      1.1  mrg 	bne,pt	%icc,.L_grt_4
    229      1.1  mrg 	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    230      1.1  mrg 
                      C n = 4: advance limbs 1..3 far enough to join the wind-down at .L4.
    231      1.1  mrg 	stx	%g2,[%fp+72]
    232      1.1  mrg 	fitod	%f0,%f4
    233      1.1  mrg 	fitod	%f1,%f6
    234      1.1  mrg 	add	%fp, 72, %l3
    235      1.1  mrg 	fmuld	%f2,%f4,%f4
    236      1.1  mrg 	add	%fp, -40, %l4
    237      1.1  mrg 	ld	[%i1],%f9
    238      1.1  mrg 	fmuld	%f2,%f6,%f6
    239      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    240      1.1  mrg 	ldd	[%fp+80],%f0
    241      1.1  mrg 	add	%fp, 80, %l5
    242      1.1  mrg 	fdtox	%f4,%f4
    243      1.1  mrg 	b	.L4
    244      1.1  mrg 	add	%fp, -24, %l6
    245      1.1  mrg 
                      C n >= 5: stage masked limb 3 in [%fp+72], split limb 4 and store the
                      C products of limb 1 in [%fp-40]/[%fp-32].  If the count is now zero (n = 5)
                      C go straight to the wind-down via .L5, otherwise enter the main loop.
    246      1.1  mrg .L_grt_4:
    247      1.1  mrg 	stx	%g2,[%fp+72]
    248      1.1  mrg 	fitod	%f0,%f4
    249      1.1  mrg 	lduw	[%i1+8],%g4
    250      1.1  mrg 	fitod	%f1,%f6
    251      1.1  mrg 	fmuld	%f2,%f4,%f4
    252      1.1  mrg 	ld	[%i1],%f9
    253      1.1  mrg 	fmuld	%f2,%f6,%f6
    254      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    255      1.1  mrg 	ldd	[%fp+80],%f0
    256      1.1  mrg 	fdtox	%f4,%f4
    257      1.1  mrg 	sllx	%g4,16,%g3		C 0000hhhhllll0000
    258      1.1  mrg 	fdtox	%f6,%f6
    259      1.1  mrg 	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    260      1.1  mrg 	subcc	%i2,1,%i2
    261      1.1  mrg 	std	%f4,[%fp-40]
    262      1.1  mrg 	fxtod	%f8,%f2
    263      1.1  mrg 	std	%f6,[%fp-32]
    264      1.1  mrg 	be,pn	%icc,.L5
    265      1.1  mrg 	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    266      1.1  mrg 
    267      1.1  mrg 	b,a	.Loop
    268      1.1  mrg 
    269      1.1  mrg 	.align	16
                      C Main loop, two limbs per pass.  The first half uses input slot [%fp+80]
                      C and product slots [%fp-24]/[%fp-16]; the second half (after LOOP MIDDLE)
                      C uses [%fp+72] and [%fp-40]/[%fp-32].
                      C
                      C Writing j for the limb whose masked form the stx of a half stores, that
                      C half works on five limbs at once:
                      C   lduw, sllx, or, andn		split limb j+1
                      C   ld, fxtod, ldd		fetch and convert limb j-1
                      C   fitod, fmuld, fdtox, std	form the products of limb j-2
                      C   ldx, sllx, add, srlx, stw	combine and store the square of limb j-4
                      C (the products of limb j-3 wait in the other product slot pair).
                      C
                      C The instructions are laid out in groups of four separated by C --- lines,
                      C presumably one issue group per cycle, padded with nop and fanop.
    270      1.1  mrg C --- LOOP BEGIN
    271      1.1  mrg .Loop:	nop
    272      1.1  mrg 	nop
    273      1.1  mrg 	stx	%g2,[%fp+80]
    274      1.1  mrg 	fitod	%f0,%f4
    275      1.1  mrg C ---
    276      1.1  mrg 	nop
    277      1.1  mrg 	nop
    278      1.1  mrg 	lduw	[%i1+8],%g4
    279      1.1  mrg 	fitod	%f1,%f6
    280      1.1  mrg C ---
    281      1.1  mrg 	nop
    282      1.1  mrg 	nop
    283      1.1  mrg 	ldx	[%fp-24],%g2		C p16
    284      1.1  mrg 	fanop
    285      1.1  mrg C ---
    286      1.1  mrg 	nop
    287      1.1  mrg 	nop
    288      1.1  mrg 	ldx	[%fp-16],%g1		C p0
    289      1.1  mrg 	fmuld	%f2,%f4,%f4
    290      1.1  mrg C ---
    291      1.1  mrg 	sllx	%g2,16,%g2		C align p16
    292      1.1  mrg 	add	%i0,8,%i0		C res_ptr++
    293      1.1  mrg 	ld	[%i1],%f9
    294      1.1  mrg 	fmuld	%f2,%f6,%f6
    295      1.1  mrg C ---
    296      1.1  mrg 	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    297      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    298      1.1  mrg 	ldd	[%fp+72],%f0
    299      1.1  mrg 	fanop
    300      1.1  mrg C ---
    301      1.1  mrg 	srlx	%g1,32,%l0
    302      1.1  mrg 	nop
    303      1.1  mrg 	stw	%g1,[%i0-8]
    304      1.1  mrg 	fdtox	%f4,%f4
    305      1.1  mrg C ---
    306      1.1  mrg 	sllx	%g4,16,%g3		C 0000hhhhllll0000
    307      1.1  mrg 	nop
    308      1.1  mrg 	stw	%l0,[%i0-4]
    309      1.1  mrg 	fdtox	%f6,%f6
    310      1.1  mrg C ---
    311      1.1  mrg 	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    312      1.1  mrg 	subcc	%i2,1,%i2
    313      1.1  mrg 	std	%f4,[%fp-24]
    314      1.1  mrg 	fxtod	%f8,%f2
    315      1.1  mrg C ---
    316      1.1  mrg 	std	%f6,[%fp-16]
    317      1.1  mrg 	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    318      1.1  mrg 	be,pn	%icc,.Lend
    319      1.1  mrg 	fanop
    320      1.1  mrg C ---  LOOP MIDDLE
    321      1.1  mrg 	nop
    322      1.1  mrg 	nop
    323      1.1  mrg 	stx	%g2,[%fp+72]
    324      1.1  mrg 	fitod	%f0,%f4
    325      1.1  mrg C ---
    326      1.1  mrg 	nop
    327      1.1  mrg 	nop
    328      1.1  mrg 	lduw	[%i1+8],%g4
    329      1.1  mrg 	fitod	%f1,%f6
    330      1.1  mrg C ---
    331      1.1  mrg 	nop
    332      1.1  mrg 	nop
    333      1.1  mrg 	ldx	[%fp-40],%g2		C p16
    334      1.1  mrg 	fanop
    335      1.1  mrg C ---
    336      1.1  mrg 	nop
    337      1.1  mrg 	nop
    338      1.1  mrg 	ldx	[%fp-32],%g1		C p0
    339      1.1  mrg 	fmuld	%f2,%f4,%f4
    340      1.1  mrg C ---
    341      1.1  mrg 	sllx	%g2,16,%g2		C align p16
    342      1.1  mrg 	add	%i0,8,%i0		C res_ptr++
    343      1.1  mrg 	ld	[%i1],%f9
    344      1.1  mrg 	fmuld	%f2,%f6,%f6
    345      1.1  mrg C ---
    346      1.1  mrg 	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    347      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    348      1.1  mrg 	ldd	[%fp+80],%f0
    349      1.1  mrg 	fanop
    350      1.1  mrg C ---
    351      1.1  mrg 	srlx	%g1,32,%l0
    352      1.1  mrg 	nop
    353      1.1  mrg 	stw	%g1,[%i0-8]
    354      1.1  mrg 	fdtox	%f4,%f4
    355      1.1  mrg C ---
    356      1.1  mrg 	sllx	%g4,16,%g3		C 0000hhhhllll0000
    357      1.1  mrg 	nop
    358      1.1  mrg 	stw	%l0,[%i0-4]
    359      1.1  mrg 	fdtox	%f6,%f6
    360      1.1  mrg C ---
    361      1.1  mrg 	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
    362      1.1  mrg 	subcc	%i2,1,%i2
    363      1.1  mrg 	std	%f4,[%fp-40]
    364      1.1  mrg 	fxtod	%f8,%f2
    365      1.1  mrg C ---
    366      1.1  mrg 	std	%f6,[%fp-32]
    367      1.1  mrg 	andn	%g2,%g5,%g2		C 0000hhhh0000llll
    368      1.1  mrg 	bne,pt	%icc,.Loop
    369      1.1  mrg 	fanop
    370      1.1  mrg C --- LOOP END
    371      1.1  mrg 
                      C Wind-down.  The shared tail addresses the staging slots through %l3-%l6
                      C so that it can continue with whichever half would have run next:
                      C   %l3	input slot written by the stx at .Ltail and read last
                      C   %l5	the other input slot, read first
                      C   %l4	product slot pair read at .Ltail, .L3 and .L1
                      C   %l6	product slot pair read after .L4 and after .L2
                      C .L5 is reached by falling out of the loop bottom, or from .L_grt_4 when
                      C n = 5, and sets the addresses used by the first loop half.
    372      1.1  mrg .L5:	add	%fp, 80, %l3
    373      1.1  mrg 	add	%fp, -24, %l4
    374      1.1  mrg 	add	%fp, 72, %l5
    375      1.1  mrg 	b	.Ltail
    376      1.1  mrg 	add	%fp, -40, %l6
    377      1.1  mrg 
                      C .Lend is reached from the exit in the loop middle and sets the addresses
                      C used by the second loop half.
    378      1.1  mrg .Lend:	add	%fp, 72, %l3
    379      1.1  mrg 	add	%fp, -40, %l4
    380      1.1  mrg 	add	%fp, 80, %l5
    381      1.1  mrg 	add	%fp, -24, %l6
                      C Drain the pipeline: every stage below stores one more square, and no
                      C further limbs are split.  .L4, .L3, .L2 and .L1 are also the entry points
                      C used by the feed-in code for n = 4, 3, 2 and 1.
    382      1.1  mrg .Ltail:	stx	%g2,[%l3]
    383      1.1  mrg 	fitod	%f0,%f4
    384      1.1  mrg 	fitod	%f1,%f6
    385      1.1  mrg 	ldx	[%l4],%g2		C p16
    386      1.1  mrg 	ldx	[%l4+8],%g1		C p0
    387      1.1  mrg 	fmuld	%f2,%f4,%f4
    388      1.1  mrg 	sllx	%g2,16,%g2		C align p16
    389      1.1  mrg 	add	%i0,8,%i0		C res_ptr++
    390      1.1  mrg 	ld	[%i1],%f9
    391      1.1  mrg 	fmuld	%f2,%f6,%f6
    392      1.1  mrg 	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    393      1.1  mrg 	add	%i1,4,%i1		C s1_ptr++
    394      1.1  mrg 	ldd	[%l5],%f0
    395      1.1  mrg 	srlx	%g1,32,%l0
    396      1.1  mrg 	stw	%g1,[%i0-8]
    397      1.1  mrg 	fdtox	%f4,%f4
    398      1.1  mrg 	stw	%l0,[%i0-4]
    399      1.1  mrg .L4:	fdtox	%f6,%f6
    400      1.1  mrg 	std	%f4,[%l4]
    401      1.1  mrg 	fxtod	%f8,%f2
    402      1.1  mrg 	std	%f6,[%l4+8]
    403      1.1  mrg 
    404      1.1  mrg 	fitod	%f0,%f4
    405      1.1  mrg 	fitod	%f1,%f6
    406      1.1  mrg 	ldx	[%l6],%g2		C p16
    407      1.1  mrg 	ldx	[%l6+8],%g1		C p0
    408      1.1  mrg 	fmuld	%f2,%f4,%f4
    409      1.1  mrg 	sllx	%g2,16,%g2		C align p16
    410      1.1  mrg 	add	%i0,8,%i0		C res_ptr++
    411      1.1  mrg 	ld	[%i1],%f9
    412      1.1  mrg 	fmuld	%f2,%f6,%f6
    413      1.1  mrg 	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    414      1.1  mrg 	ldd	[%l3],%f0
    415      1.1  mrg 	srlx	%g1,32,%l0
    416      1.1  mrg 	stw	%g1,[%i0-8]
    417      1.1  mrg 	fdtox	%f4,%f4
    418      1.1  mrg 	stw	%l0,[%i0-4]
    419      1.1  mrg .L3:	fdtox	%f6,%f6
    420      1.1  mrg 	std	%f4,[%l6]
    421      1.1  mrg 	fxtod	%f8,%f2
    422      1.1  mrg 	std	%f6,[%l6+8]
    423      1.1  mrg 
    424      1.1  mrg 	fitod	%f0,%f4
    425      1.1  mrg 	fitod	%f1,%f6
    426      1.1  mrg 	ldx	[%l4],%g2		C p16
    427      1.1  mrg 	ldx	[%l4+8],%g1		C p0
    428      1.1  mrg 	fmuld	%f2,%f4,%f4
    429      1.1  mrg 	sllx	%g2,16,%g2		C align p16
    430      1.1  mrg 	add	%i0,8,%i0		C res_ptr++
    431      1.1  mrg 	fmuld	%f2,%f6,%f6
    432      1.1  mrg 	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    433      1.1  mrg 	srlx	%g1,32,%l0
    434      1.1  mrg 	stw	%g1,[%i0-8]
    435      1.1  mrg 	fdtox	%f4,%f4
    436      1.1  mrg 	stw	%l0,[%i0-4]
    437      1.1  mrg .L2:	fdtox	%f6,%f6
    438      1.1  mrg 	std	%f4,[%l4]
    439      1.1  mrg 	std	%f6,[%l4+8]
    440      1.1  mrg 
    441      1.1  mrg 	ldx	[%l6],%g2		C p16
    442      1.1  mrg 	ldx	[%l6+8],%g1		C p0
    443      1.1  mrg 	sllx	%g2,16,%g2		C align p16
    444      1.1  mrg 	add	%i0,8,%i0		C res_ptr++
    445      1.1  mrg 	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    446      1.1  mrg 	srlx	%g1,32,%l0
    447      1.1  mrg 	stw	%g1,[%i0-8]
    448      1.1  mrg 	stw	%l0,[%i0-4]
    449      1.1  mrg 
                      C Combine and store the square of the last limb.
    450      1.1  mrg .L1:	ldx	[%l4],%g2		C p16
    451      1.1  mrg 	ldx	[%l4+8],%g1		C p0
    452      1.1  mrg 	sllx	%g2,16,%g2		C align p16
    453      1.1  mrg 	add	%i0,8,%i0		C res_ptr++
    454      1.1  mrg 	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
    455      1.1  mrg 	srlx	%g1,32,%l0
    456      1.1  mrg 	stw	%g1,[%i0-8]
    457      1.1  mrg 	stw	%l0,[%i0-4]
    458      1.1  mrg 
                      C Return; the restore in the delay slot pops the register window and writes
                      C %g0+%g0 = 0 to the caller's %o0.
    459      1.1  mrg 	ret
    460      1.1  mrg 	restore	%g0,%g0,%o0
    461      1.1  mrg 
    462      1.1  mrg EPILOGUE(mpn_sqr_diagonal)
    463