Home | History | Annotate | Line # | Download | only in ia64
mul_2.asm revision 1.1.1.2
      1      1.1  mrg dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
      2      1.1  mrg dnl  store the result to a (n+1)-limb number.
      3      1.1  mrg 
      4  1.1.1.2  mrg dnl  Contributed to the GNU project by Torbjorn Granlund.
      5  1.1.1.2  mrg 
      6  1.1.1.2  mrg dnl  Copyright 2004, 2011 Free Software Foundation, Inc.
      7      1.1  mrg 
      8      1.1  mrg dnl  This file is part of the GNU MP Library.
      9      1.1  mrg 
     10      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     11      1.1  mrg dnl  it under the terms of the GNU Lesser General Public License as published
     12      1.1  mrg dnl  by the Free Software Foundation; either version 3 of the License, or (at
     13      1.1  mrg dnl  your option) any later version.
     14      1.1  mrg 
     15      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     16      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     17      1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     18      1.1  mrg dnl  License for more details.
     19      1.1  mrg 
     20      1.1  mrg dnl  You should have received a copy of the GNU Lesser General Public License
     21      1.1  mrg dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     22      1.1  mrg 
     23      1.1  mrg include(`../config.m4')
     24      1.1  mrg 
     25      1.1  mrg C         cycles/limb
     26  1.1.1.2  mrg C Itanium:    ?
     27  1.1.1.2  mrg C Itanium 2:  1.5
     28      1.1  mrg 
     29      1.1  mrg C TODO
     30      1.1  mrg C  * Clean up variable names, and try to decrease the number of distinct
     31      1.1  mrg C    registers used.
     32  1.1.1.2  mrg C  * Clean up feed-in code to not require zeroing several registers.
     33      1.1  mrg C  * Make sure we don't depend on uninitialized predicate registers.
     34      1.1  mrg C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
     35      1.1  mrg C    wind-down code.
     36      1.1  mrg C  * Ultimately rewrite.  The problem with this code is that it first uses a
     37      1.1  mrg C    loaded u value in one xma pair, then leaves it live over several unrelated
     38      1.1  mrg C    xma pairs, before it uses it again.  It should actually be quite possible
     39      1.1  mrg C    to just swap some aligned xma pairs around.  But we should then schedule
     40      1.1  mrg C    u loads further from the first use.
     41      1.1  mrg 
     42      1.1  mrg C INPUT PARAMETERS
     43      1.1  mrg define(`rp',`r32')
     44      1.1  mrg define(`up',`r33')
     45      1.1  mrg define(`n',`r34')
     46      1.1  mrg define(`vp',`r35')
     47      1.1  mrg 
                      C srp is defined here but is not referenced by any instruction below.
     48      1.1  mrg define(`srp',`r3')
     49      1.1  mrg 
                      C v0 and v1 are loaded with ldf8 from vp[0] and vp[1] at function entry.
     50      1.1  mrg define(`v0',`f6')
     51      1.1  mrg define(`v1',`f7')
     52      1.1  mrg 
                      C s0 is the limb value written to rp with st8; acc0 is the partial sum it
                      C is formed from.  r14 is also used under its raw name at function entry
                      C to hold n mod 4, before s0 is first written.
     53      1.1  mrg define(`s0',`r14')
     54      1.1  mrg define(`acc0',`r15')
     55      1.1  mrg 
                      C pr0_k receives getfsig of fp0b_k.
     56      1.1  mrg define(`pr0_0',`r16') define(`pr0_1',`r17')
     57      1.1  mrg define(`pr0_2',`r18') define(`pr0_3',`r19')
     58      1.1  mrg 
                      C pr1_k receives getfsig of fp1b_k.
     59      1.1  mrg define(`pr1_0',`r20') define(`pr1_1',`r21')
     60      1.1  mrg define(`pr1_2',`r22') define(`pr1_3',`r23')
     61      1.1  mrg 
                      C acc1_k receives getfsig of fp2a_k.
     62      1.1  mrg define(`acc1_0',`r24') define(`acc1_1',`r25')
     63      1.1  mrg define(`acc1_2',`r26') define(`acc1_3',`r27')
     64      1.1  mrg 
     65      1.1  mrg dnl define(`',`r28')
     66      1.1  mrg dnl define(`',`r29')
     67      1.1  mrg dnl define(`',`r30')
     68      1.1  mrg dnl define(`',`r31')
     69      1.1  mrg 
                      C fp0b_k is written by xma.l  (u * v0 + f0): low half of u*v0.
     70      1.1  mrg define(`fp0b_0',`f8') define(`fp0b_1',`f9')
     71      1.1  mrg define(`fp0b_2',`f10') define(`fp0b_3',`f11')
     72      1.1  mrg 
                      C fp1a_k is written by xma.hu (u * v0 + f0): high half of u*v0.
     73      1.1  mrg define(`fp1a_0',`f12') define(`fp1a_1',`f13')
     74      1.1  mrg define(`fp1a_2',`f14') define(`fp1a_3',`f15')
     75      1.1  mrg 
                      C fp1b_k is written by xma.l  (u * v1 + fp1a_k): low half.
     76      1.1  mrg define(`fp1b_0',`f32') define(`fp1b_1',`f33')
     77      1.1  mrg define(`fp1b_2',`f34') define(`fp1b_3',`f35')
     78      1.1  mrg 
                      C fp2a_k is written by xma.hu (u * v1 + fp1a_k): high half.
     79      1.1  mrg define(`fp2a_0',`f36') define(`fp2a_1',`f37')
     80      1.1  mrg define(`fp2a_2',`f38') define(`fp2a_3',`f39')
     81      1.1  mrg 
                      C u_0..u_3 receive successive limbs loaded from up with ldf8.
     82      1.1  mrg define(`u_0',`f44') define(`u_1',`f45')
     83      1.1  mrg define(`u_2',`f46') define(`u_3',`f47')
     84      1.1  mrg 
                      C ux and uy receive up[0] and up[1], loaded at function entry.
     85      1.1  mrg define(`ux',`f49')
     86      1.1  mrg define(`uy',`f51')
     87      1.1  mrg 
     88      1.1  mrg ASM_START()
                      C mpn_mul_2: multiplies the n limbs at up by the two limbs at vp, storing
                      C result limbs through rp; the code leaves the most significant limb in r8
                      C when it returns.
                      C NOTE(review): up[1] is loaded unconditionally and n-2 is shifted as an
                      C unsigned value, so this presumably requires n >= 2 -- confirm with callers.
     89      1.1  mrg PROLOGUE(mpn_mul_2)
     90      1.1  mrg 	.prologue
     91      1.1  mrg 	.save	ar.lc, r2
     92      1.1  mrg 	.body
     93      1.1  mrg 
                      C With HAVE_ABI_32 defined, the three pointer arguments go through addp4
                      C and n is zero-extended from 32 bits with zxt4.
     94  1.1.1.2  mrg ifdef(`HAVE_ABI_32',`
     95  1.1.1.2  mrg .mmi;		addp4	rp = 0, rp		C			M I
     96  1.1.1.2  mrg 		addp4	up = 0, up		C			M I
     97  1.1.1.2  mrg 		addp4	vp = 0, vp		C			M I
     98  1.1.1.2  mrg .mmi;		nop	1
     99  1.1.1.2  mrg 		nop	1
    100  1.1.1.2  mrg 		zxt4	n = n			C			I
    101      1.1  mrg 	;;')
    102      1.1  mrg 
                      C Load up[0] and vp[0]; keep the incoming ar.lc in r2 (restored before
                      C each return); r14 = n mod 4; n -= 2.
    103  1.1.1.2  mrg .mmi;		ldf8	ux = [up], 8		C			M
    104  1.1.1.2  mrg 		ldf8	v0 = [vp], 8		C			M
    105  1.1.1.2  mrg 		mov	r2 = ar.lc		C			I0
    106  1.1.1.2  mrg .mmi;		nop	1			C			M
    107  1.1.1.2  mrg 		and	r14 = 3, n		C			M I
    108  1.1.1.2  mrg 		add	n = -2, n		C			M I
    109  1.1.1.2  mrg 	;;
                      C Load up[1] and vp[1]; n = (n - 2) >> 2 (unsigned) becomes the value
                      C put in ar.lc; start testing n mod 4.
    110  1.1.1.2  mrg .mmi;		ldf8	uy = [up], 8		C			M
    111  1.1.1.2  mrg 		ldf8	v1 = [vp]		C			M
    112  1.1.1.2  mrg 		shr.u	n = n, 2		C			I
    113  1.1.1.2  mrg .mmi;		nop	1			C			M
    114  1.1.1.2  mrg 		cmp.eq	p10, p0 = 1, r14	C			M I
    115  1.1.1.2  mrg 		cmp.eq	p11, p0 = 2, r14	C			M I
    116  1.1.1.2  mrg 	;;
                      C Dispatch on n mod 4: 1 -> L(b01), 2 -> L(b10), 3 -> L(b11);
                      C 0 falls through to L(b00).
    117  1.1.1.2  mrg .mmi;		nop	1			C			M
    118  1.1.1.2  mrg 		cmp.eq	p12, p0 = 3, r14	C			M I
    119  1.1.1.2  mrg 		mov	ar.lc = n		C			I0
    120  1.1.1.2  mrg .bbb;	(p10)	br.dptk	L(b01)			C			B
    121  1.1.1.2  mrg 	(p11)	br.dptk	L(b10)			C			B
    122  1.1.1.2  mrg 	(p12)	br.dptk	L(b11)			C			B
    123      1.1  mrg 	;;
    124      1.1  mrg 
    125      1.1  mrg 	ALIGN(32)
                      C Feed-in for n mod 4 == 0.  Zeroes acc1_2, pr1_2 and pr0_3, sets p8 and
                      C p12 false (p9 and p13 true), and starts the products for ux, uy, u_1
                      C and u_2.  If br.cloop is taken the code continues at L(gt4) and enters
                      C the main loop at L(00); otherwise (ar.lc was 0, i.e. n == 4) it falls
                      C through and enters the wind-down code at L(cj4).
    126  1.1.1.2  mrg L(b00):		ldf8	u_1 = [up], 8
    127  1.1.1.2  mrg 		mov	acc1_2 = 0
    128  1.1.1.2  mrg 		mov	pr1_2 = 0
    129  1.1.1.2  mrg 		mov	pr0_3 = 0
    130  1.1.1.2  mrg 		cmp.ne	p8, p9 = r0, r0
    131  1.1.1.2  mrg 	;;
    132  1.1.1.2  mrg 		xma.l	fp0b_3 = ux, v0, f0
    133  1.1.1.2  mrg 		cmp.ne	p12, p13 = r0, r0
    134  1.1.1.2  mrg 		ldf8	u_2 = [up], 8
    135  1.1.1.2  mrg 		xma.hu	fp1a_3 = ux, v0, f0
    136  1.1.1.2  mrg 		br.cloop.dptk	L(gt4)
    137  1.1.1.2  mrg 
                      C Fall-through path: no further loads from up.
    138  1.1.1.2  mrg 		xma.l	fp0b_0 = uy, v0, f0
    139  1.1.1.2  mrg 		xma.hu	fp1a_0 = uy, v0, f0
    140  1.1.1.2  mrg 	;;
    141  1.1.1.2  mrg 		getfsig	acc0 = fp0b_3
    142  1.1.1.2  mrg 		xma.l	fp1b_3 = ux, v1, fp1a_3
    143  1.1.1.2  mrg 		xma.hu	fp2a_3 = ux, v1, fp1a_3
    144  1.1.1.2  mrg 	;;
    145  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    146  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    147  1.1.1.2  mrg 	;;
    148  1.1.1.2  mrg 		getfsig	pr0_0 = fp0b_0
    149  1.1.1.2  mrg 		xma.l	fp1b_0 = uy, v1, fp1a_0
    150  1.1.1.2  mrg 		xma.hu	fp2a_0 = uy, v1, fp1a_0
    151  1.1.1.2  mrg 	;;
    152  1.1.1.2  mrg 		getfsig	pr1_3 = fp1b_3
    153  1.1.1.2  mrg 		getfsig	acc1_3 = fp2a_3
    154  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    155  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    156  1.1.1.2  mrg 		br	L(cj4)
    157  1.1.1.2  mrg 
                      C Loop path: same products as above, interleaved with loads of u_3 and u_0.
    158  1.1.1.2  mrg L(gt4):		xma.l	fp0b_0 = uy, v0, f0
    159  1.1.1.2  mrg 		xma.hu	fp1a_0 = uy, v0, f0
    160  1.1.1.2  mrg 	;;
    161  1.1.1.2  mrg 		getfsig	acc0 = fp0b_3
    162  1.1.1.2  mrg 		xma.l	fp1b_3 = ux, v1, fp1a_3
    163  1.1.1.2  mrg 		ldf8	u_3 = [up], 8
    164  1.1.1.2  mrg 		xma.hu	fp2a_3 = ux, v1, fp1a_3
    165  1.1.1.2  mrg 	;;
    166  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    167  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    168  1.1.1.2  mrg 	;;
    169  1.1.1.2  mrg 		getfsig	pr0_0 = fp0b_0
    170  1.1.1.2  mrg 		xma.l	fp1b_0 = uy, v1, fp1a_0
    171  1.1.1.2  mrg 		xma.hu	fp2a_0 = uy, v1, fp1a_0
    172  1.1.1.2  mrg 	;;
    173  1.1.1.2  mrg 		ldf8	u_0 = [up], 8
    174  1.1.1.2  mrg 		getfsig	pr1_3 = fp1b_3
    175  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    176  1.1.1.2  mrg 	;;
    177  1.1.1.2  mrg 		getfsig	acc1_3 = fp2a_3
    178  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    179  1.1.1.2  mrg 		br	L(00)
    180      1.1  mrg 
    181      1.1  mrg 
    182      1.1  mrg 	ALIGN(32)
                      C Feed-in for n mod 4 == 1.  Zeroes acc1_1, pr1_1 and pr0_2, sets p6 and
                      C p10 false (p7 and p11 true), and starts the products for ux, uy, u_0
                      C and u_1.  If br.cloop is taken the code continues at L(gt5) and enters
                      C the main loop at L(01); otherwise it falls through and enters the
                      C wind-down code at L(cj5).
    183  1.1.1.2  mrg L(b01):		ldf8	u_0 = [up], 8		C M
    184  1.1.1.2  mrg 		mov	acc1_1 = 0		C M I
    185  1.1.1.2  mrg 		mov	pr1_1 = 0		C M I
    186  1.1.1.2  mrg 		mov	pr0_2 = 0		C M I
    187  1.1.1.2  mrg 		cmp.ne	p6, p7 = r0, r0		C M I
    188  1.1.1.2  mrg 	;;
    189  1.1.1.2  mrg 		xma.l	fp0b_2 = ux, v0, f0	C F
    190  1.1.1.2  mrg 		cmp.ne	p10, p11 = r0, r0	C M I
    191  1.1.1.2  mrg 		ldf8	u_1 = [up], 8		C M
    192  1.1.1.2  mrg 		xma.hu	fp1a_2 = ux, v0, f0	C F
    193  1.1.1.2  mrg 	;;
    194  1.1.1.2  mrg 		xma.l	fp0b_3 = uy, v0, f0	C F
    195  1.1.1.2  mrg 		xma.hu	fp1a_3 = uy, v0, f0	C F
    196  1.1.1.2  mrg 	;;
    197  1.1.1.2  mrg 		getfsig	acc0 = fp0b_2		C M
    198  1.1.1.2  mrg 		xma.l	fp1b_2 = ux, v1,fp1a_2	C F
    199  1.1.1.2  mrg 		ldf8	u_2 = [up], 8		C M
    200  1.1.1.2  mrg 		xma.hu	fp2a_2 = ux, v1,fp1a_2	C F
    201  1.1.1.2  mrg 		br.cloop.dptk	L(gt5)
    202  1.1.1.2  mrg 
                      C Fall-through path: no further loads from up.
    203  1.1.1.2  mrg 		xma.l	fp0b_0 = u_0, v0, f0	C F
    204  1.1.1.2  mrg 		xma.hu	fp1a_0 = u_0, v0, f0	C F
    205  1.1.1.2  mrg 	;;
    206  1.1.1.2  mrg 		getfsig	pr0_3 = fp0b_3		C M
    207  1.1.1.2  mrg 		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
    208  1.1.1.2  mrg 		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
    209  1.1.1.2  mrg 	;;
    210  1.1.1.2  mrg 		getfsig	pr1_2 = fp1b_2		C M
    211  1.1.1.2  mrg 		getfsig	acc1_2 = fp2a_2		C M
    212  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0	C F
    213  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0	C F
    214  1.1.1.2  mrg 		br	L(cj5)
    215  1.1.1.2  mrg 
                      C Loop path: same products as above, plus a load of u_3.
    216  1.1.1.2  mrg L(gt5):		xma.l	fp0b_0 = u_0, v0, f0
    217  1.1.1.2  mrg 		xma.hu	fp1a_0 = u_0, v0, f0
    218  1.1.1.2  mrg 	;;
    219  1.1.1.2  mrg 		getfsig	pr0_3 = fp0b_3
    220  1.1.1.2  mrg 		xma.l	fp1b_3 = uy, v1, fp1a_3
    221  1.1.1.2  mrg 		xma.hu	fp2a_3 = uy, v1, fp1a_3
    222  1.1.1.2  mrg 	;;
    223  1.1.1.2  mrg 		ldf8	u_3 = [up], 8
    224  1.1.1.2  mrg 		getfsig	pr1_2 = fp1b_2
    225  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    226  1.1.1.2  mrg 	;;
    227  1.1.1.2  mrg 		getfsig	acc1_2 = fp2a_2
    228  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    229  1.1.1.2  mrg 		br	L(01)
    230      1.1  mrg 
    231      1.1  mrg 
    232      1.1  mrg 	ALIGN(32)
                      C Feed-in for n mod 4 == 2.  If br.cloop is taken the code continues at
                      C L(gt2), which primes the pipeline and enters the main loop at L(10).
                      C Otherwise (ar.lc was 0, i.e. n == 2) the whole product is computed by
                      C the straight-line code below, which stores three limbs, leaves the top
                      C limb in r8, restores ar.lc and returns.
    233  1.1.1.2  mrg L(b10):		br.cloop.dptk	L(gt2)
    234  1.1.1.2  mrg 		xma.l	fp0b_1 = ux, v0, f0
    235  1.1.1.2  mrg 		xma.hu	fp1a_1 = ux, v0, f0
    236  1.1.1.2  mrg 	;;
    237  1.1.1.2  mrg 		xma.l	fp0b_2 = uy, v0, f0
    238  1.1.1.2  mrg 		xma.hu	fp1a_2 = uy, v0, f0
    239  1.1.1.2  mrg 	;;
                      C The lowest result limb is the low half of ux*v0, stored directly from
                      C the FP register.
    240  1.1.1.2  mrg 		stf8	[rp] = fp0b_1, 8
    241  1.1.1.2  mrg 		xma.l	fp1b_1 = ux, v1, fp1a_1
    242  1.1.1.2  mrg 		xma.hu	fp2a_1 = ux, v1, fp1a_1
    243  1.1.1.2  mrg 	;;
    244  1.1.1.2  mrg 		getfsig	acc0 = fp0b_2
    245  1.1.1.2  mrg 		xma.l	fp1b_2 = uy, v1, fp1a_2
    246  1.1.1.2  mrg 		xma.hu	fp2a_2 = uy, v1, fp1a_2
    247  1.1.1.2  mrg 	;;
    248  1.1.1.2  mrg 		getfsig	pr1_1 = fp1b_1
    249  1.1.1.2  mrg 		getfsig	acc1_1 = fp2a_1
    250  1.1.1.2  mrg 		mov	ar.lc = r2
    251  1.1.1.2  mrg 		getfsig	pr1_2 = fp1b_2
    252  1.1.1.2  mrg 		getfsig	r8 = fp2a_2
    253  1.1.1.2  mrg 	;;
                      C Second limb: s0 = pr1_1 + acc0; p8 is set when that add wrapped.
    254  1.1.1.2  mrg 		add	s0 = pr1_1, acc0
    255  1.1.1.2  mrg 	;;
    256  1.1.1.2  mrg 		st8	[rp] = s0, 8
    257  1.1.1.2  mrg 		cmp.ltu	p8, p9 = s0, pr1_1
                      C r31 = -1 - acc1_1, compared against pr1_2 below to detect a carry out
                      C of pr1_2 + acc1_1 (+ 1 when p8 is set).
    258  1.1.1.2  mrg 		sub	r31 = -1, acc1_1
    259  1.1.1.2  mrg 	;;
                      C Third limb: acc0 = pr1_2 + acc1_1 + carry-in; p10 = carry out.
    260  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    261  1.1.1.2  mrg 	(p8)	add	acc0 = pr1_2, acc1_1, 1
    262  1.1.1.2  mrg 	(p9)	add	acc0 = pr1_2, acc1_1
    263  1.1.1.2  mrg 	(p8)	cmp.leu	p10, p0 = r31, pr1_2
    264  1.1.1.2  mrg 	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
    265  1.1.1.2  mrg 	;;
                      C Store the third limb and add the carry into the top limb held in r8.
    266  1.1.1.2  mrg 		st8	[rp] = acc0, 8
    267  1.1.1.2  mrg 	(p10)	add	r8 = 1, r8
    268  1.1.1.2  mrg 		br.ret.sptk.many b0
    269  1.1.1.2  mrg 
                      C Loop path: zeroes acc1_0, pr1_0 and pr0_1, loads u_3, u_0, u_1 and u_2,
                      C starts the products for ux, uy, u_3 and u_0, and sets p8 and p12 false
                      C (p9 and p13 true) before entering the loop.
    270  1.1.1.2  mrg L(gt2):		ldf8	u_3 = [up], 8
    271  1.1.1.2  mrg 		mov	acc1_0 = 0
    272  1.1.1.2  mrg 		mov	pr1_0 = 0
    273  1.1.1.2  mrg 	;;
    274  1.1.1.2  mrg 		mov	pr0_1 = 0
    275  1.1.1.2  mrg 		xma.l	fp0b_1 = ux, v0, f0
    276  1.1.1.2  mrg 		ldf8	u_0 = [up], 8
    277  1.1.1.2  mrg 		xma.hu	fp1a_1 = ux, v0, f0
    278  1.1.1.2  mrg 	;;
    279  1.1.1.2  mrg 		xma.l	fp0b_2 = uy, v0, f0
    280  1.1.1.2  mrg 		xma.hu	fp1a_2 = uy, v0, f0
    281  1.1.1.2  mrg 	;;
    282  1.1.1.2  mrg 		getfsig	acc0 = fp0b_1
    283  1.1.1.2  mrg 		xma.l	fp1b_1 = ux, v1, fp1a_1
    284  1.1.1.2  mrg 		xma.hu	fp2a_1 = ux, v1, fp1a_1
    285  1.1.1.2  mrg 	;;
    286  1.1.1.2  mrg 		ldf8	u_1 = [up], 8
    287  1.1.1.2  mrg 		xma.l	fp0b_3 = u_3, v0, f0
    288  1.1.1.2  mrg 		xma.hu	fp1a_3 = u_3, v0, f0
    289  1.1.1.2  mrg 	;;
    290  1.1.1.2  mrg 		getfsig	pr0_2 = fp0b_2
    291  1.1.1.2  mrg 		xma.l	fp1b_2 = uy, v1, fp1a_2
    292  1.1.1.2  mrg 		xma.hu	fp2a_2 = uy, v1, fp1a_2
    293  1.1.1.2  mrg 	;;
    294  1.1.1.2  mrg 		ldf8	u_2 = [up], 8
    295  1.1.1.2  mrg 		getfsig	pr1_1 = fp1b_1
    296  1.1.1.2  mrg 	;;
    297  1.1.1.2  mrg .mfi;		getfsig	acc1_1 = fp2a_1
    298  1.1.1.2  mrg 		xma.l	fp0b_0 = u_0, v0, f0
    299  1.1.1.2  mrg 		cmp.ne	p8, p9 = r0, r0
    300  1.1.1.2  mrg .mfb;		cmp.ne	p12, p13 = r0, r0
    301  1.1.1.2  mrg 		xma.hu	fp1a_0 = u_0, v0, f0
    302  1.1.1.2  mrg 		br	L(10)
    303      1.1  mrg 
    304      1.1  mrg 
    305      1.1  mrg 	ALIGN(32)
                      C Feed-in for n mod 4 == 3.  Zeroes acc1_3, pr1_3 and pr0_0, sets p6 and
                      C p10 false (p7 and p11 true), and starts the products for ux, uy and
                      C u_2.  If br.cloop is taken the code continues at L(gt3) and enters the
                      C main loop at L(11); otherwise (ar.lc was 0, i.e. n == 3) it falls
                      C through and enters the wind-down code at L(cj3).
    306  1.1.1.2  mrg L(b11):		mov	acc1_3 = 0
    307  1.1.1.2  mrg 		mov	pr1_3 = 0
    308  1.1.1.2  mrg 		mov	pr0_0 = 0
    309  1.1.1.2  mrg 		ldf8	u_2 = [up], 8
    310  1.1.1.2  mrg 		cmp.ne	p6, p7 = r0, r0
    311  1.1.1.2  mrg 		br.cloop.dptk	L(gt3)
    312  1.1.1.2  mrg 	;;
                      C Fall-through path: no further loads from up.
    313  1.1.1.2  mrg 		xma.l	fp0b_0 = ux, v0, f0
    314  1.1.1.2  mrg 		xma.hu	fp1a_0 = ux, v0, f0
    315  1.1.1.2  mrg 	;;
    316  1.1.1.2  mrg 		cmp.ne	p10, p11 = r0, r0
    317  1.1.1.2  mrg 		xma.l	fp0b_1 = uy, v0, f0
    318  1.1.1.2  mrg 		xma.hu	fp1a_1 = uy, v0, f0
    319  1.1.1.2  mrg 	;;
    320  1.1.1.2  mrg 		getfsig	acc0 = fp0b_0
    321  1.1.1.2  mrg 		xma.l	fp1b_0 = ux, v1, fp1a_0
    322  1.1.1.2  mrg 		xma.hu	fp2a_0 = ux, v1, fp1a_0
    323  1.1.1.2  mrg 	;;
    324  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    325  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    326  1.1.1.2  mrg 	;;
    327  1.1.1.2  mrg 		getfsig	pr0_1 = fp0b_1
    328  1.1.1.2  mrg 		xma.l	fp1b_1 = uy, v1, fp1a_1
    329  1.1.1.2  mrg 		xma.hu	fp2a_1 = uy, v1, fp1a_1
    330  1.1.1.2  mrg 	;;
    331  1.1.1.2  mrg 		getfsig	pr1_0 = fp1b_0
    332  1.1.1.2  mrg 		getfsig	acc1_0 = fp2a_0
    333  1.1.1.2  mrg 		br	L(cj3)
    334  1.1.1.2  mrg 
                      C Loop path: same products as above plus the one for u_3, interleaved
                      C with loads of u_3, u_0 and u_1.
    335  1.1.1.2  mrg L(gt3):		xma.l	fp0b_0 = ux, v0, f0
    336  1.1.1.2  mrg 		cmp.ne	p10, p11 = r0, r0
    337  1.1.1.2  mrg 		ldf8	u_3 = [up], 8
    338  1.1.1.2  mrg 		xma.hu	fp1a_0 = ux, v0, f0
    339  1.1.1.2  mrg 	;;
    340  1.1.1.2  mrg 		xma.l	fp0b_1 = uy, v0, f0
    341  1.1.1.2  mrg 		xma.hu	fp1a_1 = uy, v0, f0
    342  1.1.1.2  mrg 	;;
    343  1.1.1.2  mrg 		getfsig	acc0 = fp0b_0
    344  1.1.1.2  mrg 		xma.l	fp1b_0 = ux, v1, fp1a_0
    345  1.1.1.2  mrg 		ldf8	u_0 = [up], 8
    346  1.1.1.2  mrg 		xma.hu	fp2a_0 = ux, v1, fp1a_0
    347  1.1.1.2  mrg 	;;
    348  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    349  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    350  1.1.1.2  mrg 	;;
    351  1.1.1.2  mrg 		getfsig	pr0_1 = fp0b_1
    352  1.1.1.2  mrg 		xma.l	fp1b_1 = uy, v1, fp1a_1
    353  1.1.1.2  mrg 		xma.hu	fp2a_1 = uy, v1, fp1a_1
    354  1.1.1.2  mrg 	;;
    355  1.1.1.2  mrg 		ldf8	u_1 = [up], 8
    356  1.1.1.2  mrg 		getfsig	pr1_0 = fp1b_0
    357  1.1.1.2  mrg 	;;
    358  1.1.1.2  mrg 		getfsig	acc1_0 = fp2a_0
    359  1.1.1.2  mrg 		xma.l	fp0b_3 = u_3, v0, f0
    360  1.1.1.2  mrg 		xma.hu	fp1a_3 = u_3, v0, f0
    361  1.1.1.2  mrg 		br	L(11)
    362      1.1  mrg 
    363      1.1  mrg 
    364      1.1  mrg C *** MAIN LOOP START ***
                      C One pass runs the twelve instruction groups marked C 00 .. C 11, loads
                      C four limbs from up (u_3, u_0, u_1, u_2) and stores four limbs to rp.
                      C L(01), L(00), L(11) and L(10) are the points where the feed-in code
                      C joins the pipeline.
                      C
                      C Carry handling: each predicated add pair adds 1 when the first
                      C predicate of its pair is set, and the cmp.leu/cmp.ltu pair in the next
                      C group computes the carry out of that add into the next predicate pair.
                      C p6/p7 and p8/p9 alternate for the acc0 sums, p10/p11 and p12/p13 for
                      C the s0 sums.
    365      1.1  mrg 	ALIGN(32)
    366  1.1.1.2  mrg L(top):						C 00
    367  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    368  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    369  1.1.1.2  mrg 		ldf8	u_3 = [up], 8
    370  1.1.1.2  mrg 		getfsig	pr1_2 = fp1b_2
    371  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    372  1.1.1.2  mrg 	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    373  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    374  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    375      1.1  mrg 	;;					C 01
    376  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    377  1.1.1.2  mrg 		getfsig	acc1_2 = fp2a_2
    378  1.1.1.2  mrg 		st8	[rp] = s0, 8
    379  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    380  1.1.1.2  mrg 	(p6)	add	acc0 = pr0_2, acc1_0, 1
    381  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_2, acc1_0
    382  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    383      1.1  mrg 	;;					C 02
                      C Entry point from the n mod 4 == 1 feed-in (L(gt5)).
    384  1.1.1.2  mrg L(01):
    385  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    386  1.1.1.2  mrg 		getfsig	pr0_0 = fp0b_0
    387  1.1.1.2  mrg 		xma.l	fp1b_0 = u_0, v1, fp1a_0
    388  1.1.1.2  mrg 	(p10)	add	s0 = pr1_1, acc0, 1
    389  1.1.1.2  mrg 	(p11)	add	s0 = pr1_1, acc0
    390  1.1.1.2  mrg 		xma.hu	fp2a_0 = u_0, v1, fp1a_0
    391  1.1.1.2  mrg 		nop	1
    392      1.1  mrg 	;;					C 03
    393  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    394  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    395  1.1.1.2  mrg 		ldf8	u_0 = [up], 8
    396  1.1.1.2  mrg 		getfsig	pr1_3 = fp1b_3
    397  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    398  1.1.1.2  mrg 	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    399  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    400  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    401      1.1  mrg 	;;					C 04
    402  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    403  1.1.1.2  mrg 		getfsig	acc1_3 = fp2a_3
    404  1.1.1.2  mrg 		st8	[rp] = s0, 8
    405  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    406  1.1.1.2  mrg 	(p8)	add	acc0 = pr0_3, acc1_1, 1
    407  1.1.1.2  mrg 	(p9)	add	acc0 = pr0_3, acc1_1
    408  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    409      1.1  mrg 	;;					C 05
                      C Entry point from the n mod 4 == 0 feed-in (L(gt4)).
    410  1.1.1.2  mrg L(00):
    411  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    412  1.1.1.2  mrg 		getfsig	pr0_1 = fp0b_1
    413  1.1.1.2  mrg 		xma.l	fp1b_1 = u_1, v1, fp1a_1
    414  1.1.1.2  mrg 	(p12)	add	s0 = pr1_2, acc0, 1
    415  1.1.1.2  mrg 	(p13)	add	s0 = pr1_2, acc0
    416  1.1.1.2  mrg 		xma.hu	fp2a_1 = u_1, v1, fp1a_1
    417  1.1.1.2  mrg 		nop	1
    418      1.1  mrg 	;;					C 06
    419  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    420  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    421  1.1.1.2  mrg 		ldf8	u_1 = [up], 8
    422  1.1.1.2  mrg 		getfsig	pr1_0 = fp1b_0
    423  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
    424  1.1.1.2  mrg 	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
    425  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_2
    426  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
    427      1.1  mrg 	;;					C 07
    428  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    429  1.1.1.2  mrg 		getfsig	acc1_0 = fp2a_0
    430  1.1.1.2  mrg 		st8	[rp] = s0, 8
    431  1.1.1.2  mrg 		xma.l	fp0b_3 = u_3, v0, f0
    432  1.1.1.2  mrg 	(p6)	add	acc0 = pr0_0, acc1_2, 1
    433  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_0, acc1_2
    434  1.1.1.2  mrg 		xma.hu	fp1a_3 = u_3, v0, f0
    435      1.1  mrg 	;;					C 08
                      C Entry point from the n mod 4 == 3 feed-in (L(gt3)).
    436  1.1.1.2  mrg L(11):
    437  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    438  1.1.1.2  mrg 		getfsig	pr0_2 = fp0b_2
    439  1.1.1.2  mrg 		xma.l	fp1b_2 = u_2, v1, fp1a_2
    440  1.1.1.2  mrg 	(p10)	add	s0 = pr1_3, acc0, 1
    441  1.1.1.2  mrg 	(p11)	add	s0 = pr1_3, acc0
    442  1.1.1.2  mrg 		xma.hu	fp2a_2 = u_2, v1, fp1a_2
    443  1.1.1.2  mrg 		nop	1
    444      1.1  mrg 	;;					C 09
    445  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    446  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    447  1.1.1.2  mrg 		ldf8	u_2 = [up], 8
    448  1.1.1.2  mrg 		getfsig	pr1_1 = fp1b_1
    449  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
    450  1.1.1.2  mrg 	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
    451  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_3
    452  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
    453      1.1  mrg 	;;					C 10
    454  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    455  1.1.1.2  mrg 		getfsig	acc1_1 = fp2a_1
    456  1.1.1.2  mrg 		st8	[rp] = s0, 8
    457  1.1.1.2  mrg 		xma.l	fp0b_0 = u_0, v0, f0
    458  1.1.1.2  mrg 	(p8)	add	acc0 = pr0_1, acc1_3, 1
    459  1.1.1.2  mrg 	(p9)	add	acc0 = pr0_1, acc1_3
    460  1.1.1.2  mrg 		xma.hu	fp1a_0 = u_0, v0, f0
    461      1.1  mrg 	;;					C 11
                      C Entry point from the n mod 4 == 2 feed-in (L(gt2)).
    462  1.1.1.2  mrg L(10):
    463  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    464  1.1.1.2  mrg 		getfsig	pr0_3 = fp0b_3
    465  1.1.1.2  mrg 		xma.l	fp1b_3 = u_3, v1, fp1a_3
    466  1.1.1.2  mrg 	(p12)	add	s0 = pr1_0, acc0, 1
    467  1.1.1.2  mrg 	(p13)	add	s0 = pr1_0, acc0
    468  1.1.1.2  mrg 		xma.hu	fp2a_3 = u_3, v1, fp1a_3
    469  1.1.1.2  mrg 		br.cloop.dptk	L(top)
    470      1.1  mrg 	;;
    471      1.1  mrg C *** MAIN LOOP END ***
    472      1.1  mrg 
    473  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
                      C Wind-down code: repeats the main-loop group pattern without any loads
                      C from up, finishing the products already in flight and propagating the
                      C carries through the remaining limbs.  L(cj5), L(cj4) and L(cj3) are
                      C entered from the feed-in paths when br.cloop was not taken there.
    474  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    475  1.1.1.2  mrg .mmi;		getfsig	pr1_2 = fp1b_2
    476  1.1.1.2  mrg 		st8	[rp] = s0, 8
    477  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    478  1.1.1.2  mrg .mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    479  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    480  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    481  1.1.1.2  mrg 	;;
    482  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    483  1.1.1.2  mrg .mfi;		getfsig	acc1_2 = fp2a_2
    484  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    485  1.1.1.2  mrg 		nop	1
    486  1.1.1.2  mrg .mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
    487  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_2, acc1_0
    488  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    489  1.1.1.2  mrg 	;;
                      C Entry point from the L(b01) fall-through path.
    490  1.1.1.2  mrg L(cj5):
    491  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    492  1.1.1.2  mrg .mfi;		getfsig	pr0_0 = fp0b_0
    493  1.1.1.2  mrg 		xma.l	fp1b_0 = u_0, v1, fp1a_0
    494  1.1.1.2  mrg 	(p10)	add	s0 = pr1_1, acc0, 1
    495  1.1.1.2  mrg .mfi;	(p11)	add	s0 = pr1_1, acc0
    496  1.1.1.2  mrg 		xma.hu	fp2a_0 = u_0, v1, fp1a_0
    497  1.1.1.2  mrg 		nop	1
    498  1.1.1.2  mrg 	;;
    499  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    500  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    501  1.1.1.2  mrg .mmi;		getfsig	pr1_3 = fp1b_3
    502  1.1.1.2  mrg 		st8	[rp] = s0, 8
    503  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    504  1.1.1.2  mrg .mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    505  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    506  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    507  1.1.1.2  mrg 	;;
    508  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    509  1.1.1.2  mrg .mfi;		getfsig	acc1_3 = fp2a_3
    510  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    511  1.1.1.2  mrg 		nop	1
    512  1.1.1.2  mrg .mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
    513  1.1.1.2  mrg 	(p9)	add	acc0 = pr0_3, acc1_1
    514  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    515  1.1.1.2  mrg 	;;
                      C Entry point from the L(b00) fall-through path.
    516  1.1.1.2  mrg L(cj4):
    517  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    518  1.1.1.2  mrg .mfi;		getfsig	pr0_1 = fp0b_1
    519  1.1.1.2  mrg 		xma.l	fp1b_1 = u_1, v1, fp1a_1
    520  1.1.1.2  mrg 	(p12)	add	s0 = pr1_2, acc0, 1
    521  1.1.1.2  mrg .mfi;	(p13)	add	s0 = pr1_2, acc0
    522  1.1.1.2  mrg 		xma.hu	fp2a_1 = u_1, v1, fp1a_1
    523  1.1.1.2  mrg 		nop	1
    524  1.1.1.2  mrg 	;;
    525  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    526  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    527  1.1.1.2  mrg .mmi;		getfsig	pr1_0 = fp1b_0
    528  1.1.1.2  mrg 		st8	[rp] = s0, 8
    529  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
    530  1.1.1.2  mrg .mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
    531  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_2
    532  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
    533  1.1.1.2  mrg 	;;
    534  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    535  1.1.1.2  mrg .mmi;		getfsig	acc1_0 = fp2a_0
    536  1.1.1.2  mrg 	(p6)	add	acc0 = pr0_0, acc1_2, 1
    537  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_0, acc1_2
    538  1.1.1.2  mrg 	;;
                      C Entry point from the L(b11) fall-through path.
    539  1.1.1.2  mrg L(cj3):
    540  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    541  1.1.1.2  mrg .mfi;		getfsig	pr0_2 = fp0b_2
    542  1.1.1.2  mrg 		xma.l	fp1b_2 = u_2, v1, fp1a_2
    543  1.1.1.2  mrg 	(p10)	add	s0 = pr1_3, acc0, 1
    544  1.1.1.2  mrg .mfi;	(p11)	add	s0 = pr1_3, acc0
    545  1.1.1.2  mrg 		xma.hu	fp2a_2 = u_2, v1, fp1a_2
    546  1.1.1.2  mrg 		nop	1
    547  1.1.1.2  mrg 	;;
    548  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    549  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    550  1.1.1.2  mrg .mmi;		getfsig	pr1_1 = fp1b_1
    551  1.1.1.2  mrg 		st8	[rp] = s0, 8
    552  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
    553  1.1.1.2  mrg .mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
    554  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_3
    555  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
    556  1.1.1.2  mrg 	;;
    557  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    558  1.1.1.2  mrg .mmi;		getfsig	acc1_1 = fp2a_1
    559  1.1.1.2  mrg 	(p8)	add	acc0 = pr0_1, acc1_3, 1
    560  1.1.1.2  mrg 	(p9)	add	acc0 = pr0_1, acc1_3
    561  1.1.1.2  mrg 	;;
    562  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    563  1.1.1.2  mrg .mmi;	(p12)	add	s0 = pr1_0, acc0, 1
    564  1.1.1.2  mrg 	(p13)	add	s0 = pr1_0, acc0
    565  1.1.1.2  mrg 		nop	1
    566  1.1.1.2  mrg 	;;
    567  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    568  1.1.1.2  mrg 		.pred.rel "mutex", p12, p13
    569  1.1.1.2  mrg .mmi;		getfsig	pr1_2 = fp1b_2
    570  1.1.1.2  mrg 		st8	[rp] = s0, 8
    571  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    572  1.1.1.2  mrg .mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    573  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    574  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    575  1.1.1.2  mrg 	;;
                      C r8 takes the high half of the last u*v1 product; a final carry is
                      C added to it just before the return.
    576  1.1.1.2  mrg 		.pred.rel "mutex", p6, p7
    577  1.1.1.2  mrg .mmi;		getfsig	r8 = fp2a_2
    578  1.1.1.2  mrg 	(p6)	add	acc0 = pr0_2, acc1_0, 1
    579  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_2, acc1_0
    580  1.1.1.2  mrg 	;;
    581  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    582  1.1.1.2  mrg .mmi;	(p10)	add	s0 = pr1_1, acc0, 1
    583  1.1.1.2  mrg 	(p11)	add	s0 = pr1_1, acc0
    584  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    585  1.1.1.2  mrg 	;;
    586  1.1.1.2  mrg 		.pred.rel "mutex", p10, p11
    587  1.1.1.2  mrg .mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    588  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    589  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    590  1.1.1.2  mrg 	;;
                      C Last stored limb: acc0 = pr1_2 + acc1_1 plus the two pending carries
                      C (p8 folded into the add, p12 added separately below).
    591  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    592  1.1.1.2  mrg .mmi;		st8	[rp] = s0, 8
    593  1.1.1.2  mrg 	(p8)	add	acc0 = pr1_2, acc1_1, 1
    594  1.1.1.2  mrg 	(p9)	add	acc0 = pr1_2, acc1_1
    595  1.1.1.2  mrg 	;;
                      C p10 = carry out of the add above; the cmpeqor below also sets p10
                      C when adding the p12 carry made acc0 wrap to 0.
    596  1.1.1.2  mrg 		.pred.rel "mutex", p8, p9
    597  1.1.1.2  mrg .mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
    598  1.1.1.2  mrg 	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
    599  1.1.1.2  mrg 	(p12)	add	acc0 = 1, acc0
    600  1.1.1.2  mrg 	;;
    601  1.1.1.2  mrg .mmi;		st8	[rp] = acc0, 8
    602  1.1.1.2  mrg 	(p12)	cmpeqor	p10, p0 = 0, acc0
    603  1.1.1.2  mrg 		nop	1
    604  1.1.1.2  mrg 	;;
                      C Add the final carry into r8, restore the caller's ar.lc from r2, return.
    605  1.1.1.2  mrg .mib;	(p10)	add	r8 = 1, r8
    606  1.1.1.2  mrg 		mov	ar.lc = r2
    607  1.1.1.2  mrg 		br.ret.sptk.many b0
    608      1.1  mrg EPILOGUE()
    609      1.1  mrg ASM_END()
    610