Home | History | Annotate | Line # | Download | only in ia64
      1      1.1  mrg dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
      2      1.1  mrg dnl  store the result to a (n+1)-limb number.
      3      1.1  mrg 
      4  1.1.1.2  mrg dnl  Contributed to the GNU project by Torbjorn Granlund.
      5  1.1.1.2  mrg 
      6  1.1.1.2  mrg dnl  Copyright 2004, 2011 Free Software Foundation, Inc.
      7      1.1  mrg 
      8      1.1  mrg dnl  This file is part of the GNU MP Library.
      9  1.1.1.3  mrg dnl
     10      1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     11  1.1.1.3  mrg dnl  it under the terms of either:
     12  1.1.1.3  mrg dnl
     13  1.1.1.3  mrg dnl    * the GNU Lesser General Public License as published by the Free
     14  1.1.1.3  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     15  1.1.1.3  mrg dnl      option) any later version.
     16  1.1.1.3  mrg dnl
     17  1.1.1.3  mrg dnl  or
     18  1.1.1.3  mrg dnl
     19  1.1.1.3  mrg dnl    * the GNU General Public License as published by the Free Software
     20  1.1.1.3  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     21  1.1.1.3  mrg dnl      later version.
     22  1.1.1.3  mrg dnl
     23  1.1.1.3  mrg dnl  or both in parallel, as here.
     24  1.1.1.3  mrg dnl
     25      1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     26      1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     27  1.1.1.3  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     28  1.1.1.3  mrg dnl  for more details.
     29  1.1.1.3  mrg dnl
     30  1.1.1.3  mrg dnl  You should have received copies of the GNU General Public License and the
     31  1.1.1.3  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     32  1.1.1.3  mrg dnl  see https://www.gnu.org/licenses/.
     33      1.1  mrg 
     34      1.1  mrg include(`../config.m4')
     35      1.1  mrg 
     36      1.1  mrg C         cycles/limb
     37  1.1.1.2  mrg C Itanium:    ?
     38  1.1.1.2  mrg C Itanium 2:  1.5
     39      1.1  mrg 
     40      1.1  mrg C TODO
     41      1.1  mrg C  * Clean up variable names, and try to decrease the number of distinct
     42      1.1  mrg C    registers used.
     43  1.1.1.2  mrg C  * Clean up feed-in code to not require zeroing several registers.
     44      1.1  mrg C  * Make sure we don't depend on uninitialized predicate registers.
     45      1.1  mrg C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
     46      1.1  mrg C    wind-down code.
     47      1.1  mrg C  * Ultimately rewrite.  The problem with this code is that it first uses a
     48      1.1  mrg C    loaded u value in one xma pair, then leaves it live over several unrelated
     49      1.1  mrg C    xma pairs, before it uses it again.  It should actually be quite possible
     50      1.1  mrg C    to just swap some aligned xma pairs around.  But we should then schedule
     51      1.1  mrg C    u loads further from the first use.
     52      1.1  mrg 
     53      1.1  mrg C INPUT PARAMETERS
     54      1.1  mrg define(`rp',`r32')
     55      1.1  mrg define(`up',`r33')
     56      1.1  mrg define(`n',`r34')
     57      1.1  mrg define(`vp',`r35')
     58      1.1  mrg 
     59      1.1  mrg define(`srp',`r3')
     60      1.1  mrg 
     61      1.1  mrg define(`v0',`f6')
     62      1.1  mrg define(`v1',`f7')
     63      1.1  mrg 
     64      1.1  mrg define(`s0',`r14')
     65      1.1  mrg define(`acc0',`r15')
     66      1.1  mrg 
     67      1.1  mrg define(`pr0_0',`r16') define(`pr0_1',`r17')
     68      1.1  mrg define(`pr0_2',`r18') define(`pr0_3',`r19')
     69      1.1  mrg 
     70      1.1  mrg define(`pr1_0',`r20') define(`pr1_1',`r21')
     71      1.1  mrg define(`pr1_2',`r22') define(`pr1_3',`r23')
     72      1.1  mrg 
     73      1.1  mrg define(`acc1_0',`r24') define(`acc1_1',`r25')
     74      1.1  mrg define(`acc1_2',`r26') define(`acc1_3',`r27')
     75      1.1  mrg 
     76      1.1  mrg dnl define(`',`r28')
     77      1.1  mrg dnl define(`',`r29')
     78      1.1  mrg dnl define(`',`r30')
     79      1.1  mrg dnl define(`',`r31')
     80      1.1  mrg 
     81      1.1  mrg define(`fp0b_0',`f8') define(`fp0b_1',`f9')
     82      1.1  mrg define(`fp0b_2',`f10') define(`fp0b_3',`f11')
     83      1.1  mrg 
     84      1.1  mrg define(`fp1a_0',`f12') define(`fp1a_1',`f13')
     85      1.1  mrg define(`fp1a_2',`f14') define(`fp1a_3',`f15')
     86      1.1  mrg 
     87      1.1  mrg define(`fp1b_0',`f32') define(`fp1b_1',`f33')
     88      1.1  mrg define(`fp1b_2',`f34') define(`fp1b_3',`f35')
     89      1.1  mrg 
     90      1.1  mrg define(`fp2a_0',`f36') define(`fp2a_1',`f37')
     91      1.1  mrg define(`fp2a_2',`f38') define(`fp2a_3',`f39')
     92      1.1  mrg 
     93      1.1  mrg define(`u_0',`f44') define(`u_1',`f45')
     94      1.1  mrg define(`u_2',`f46') define(`u_3',`f47')
     95      1.1  mrg 
     96      1.1  mrg define(`ux',`f49')
     97      1.1  mrg define(`uy',`f51')
     98      1.1  mrg 
     99      1.1  mrg ASM_START()
    100      1.1  mrg PROLOGUE(mpn_mul_2)
    101      1.1  mrg 	.prologue
    102      1.1  mrg 	.save	ar.lc, r2
    103      1.1  mrg 	.body
    104      1.1  mrg 
    105  1.1.1.2  mrg ifdef(`HAVE_ABI_32',`
    106  1.1.1.3  mrg  {.mmi;		addp4	rp = 0, rp		C			M I
    107  1.1.1.2  mrg 		addp4	up = 0, up		C			M I
    108  1.1.1.2  mrg 		addp4	vp = 0, vp		C			M I
    109  1.1.1.3  mrg }{.mmi;		nop	1
    110  1.1.1.2  mrg 		nop	1
    111  1.1.1.2  mrg 		zxt4	n = n			C			I
    112  1.1.1.3  mrg 	;;
    113  1.1.1.3  mrg }')
    114      1.1  mrg 
    115  1.1.1.3  mrg  {.mmi;		ldf8	ux = [up], 8		C			M
    116  1.1.1.2  mrg 		ldf8	v0 = [vp], 8		C			M
    117  1.1.1.2  mrg 		mov	r2 = ar.lc		C			I0
    118  1.1.1.3  mrg }{.mmi;		nop	1			C			M
    119  1.1.1.2  mrg 		and	r14 = 3, n		C			M I
    120  1.1.1.2  mrg 		add	n = -2, n		C			M I
    121  1.1.1.2  mrg 	;;
    122  1.1.1.3  mrg }{.mmi;		ldf8	uy = [up], 8		C			M
    123  1.1.1.2  mrg 		ldf8	v1 = [vp]		C			M
    124  1.1.1.3  mrg 		shr.u	n = n, 2		C			I0
    125  1.1.1.3  mrg }{.mmi;		nop	1			C			M
    126  1.1.1.2  mrg 		cmp.eq	p10, p0 = 1, r14	C			M I
    127  1.1.1.2  mrg 		cmp.eq	p11, p0 = 2, r14	C			M I
    128  1.1.1.2  mrg 	;;
    129  1.1.1.3  mrg }{.mmi;		nop	1			C			M
    130  1.1.1.2  mrg 		cmp.eq	p12, p0 = 3, r14	C			M I
    131  1.1.1.2  mrg 		mov	ar.lc = n		C			I0
    132  1.1.1.3  mrg }{.bbb;	(p10)	br.dptk	L(b01)			C			B
    133  1.1.1.2  mrg 	(p11)	br.dptk	L(b10)			C			B
    134  1.1.1.2  mrg 	(p12)	br.dptk	L(b11)			C			B
    135      1.1  mrg 	;;
    136  1.1.1.3  mrg }
    137      1.1  mrg 	ALIGN(32)
    138  1.1.1.2  mrg L(b00):		ldf8	u_1 = [up], 8
    139  1.1.1.2  mrg 		mov	acc1_2 = 0
    140  1.1.1.2  mrg 		mov	pr1_2 = 0
    141  1.1.1.2  mrg 		mov	pr0_3 = 0
    142  1.1.1.2  mrg 		cmp.ne	p8, p9 = r0, r0
    143  1.1.1.2  mrg 	;;
    144  1.1.1.2  mrg 		xma.l	fp0b_3 = ux, v0, f0
    145  1.1.1.2  mrg 		cmp.ne	p12, p13 = r0, r0
    146  1.1.1.2  mrg 		ldf8	u_2 = [up], 8
    147  1.1.1.2  mrg 		xma.hu	fp1a_3 = ux, v0, f0
    148  1.1.1.2  mrg 		br.cloop.dptk	L(gt4)
    149  1.1.1.2  mrg 
    150  1.1.1.2  mrg 		xma.l	fp0b_0 = uy, v0, f0
    151  1.1.1.2  mrg 		xma.hu	fp1a_0 = uy, v0, f0
    152  1.1.1.2  mrg 	;;
    153  1.1.1.2  mrg 		getfsig	acc0 = fp0b_3
    154  1.1.1.2  mrg 		xma.l	fp1b_3 = ux, v1, fp1a_3
    155  1.1.1.2  mrg 		xma.hu	fp2a_3 = ux, v1, fp1a_3
    156  1.1.1.2  mrg 	;;
    157  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    158  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    159  1.1.1.2  mrg 	;;
    160  1.1.1.2  mrg 		getfsig	pr0_0 = fp0b_0
    161  1.1.1.2  mrg 		xma.l	fp1b_0 = uy, v1, fp1a_0
    162  1.1.1.2  mrg 		xma.hu	fp2a_0 = uy, v1, fp1a_0
    163  1.1.1.2  mrg 	;;
    164  1.1.1.2  mrg 		getfsig	pr1_3 = fp1b_3
    165  1.1.1.2  mrg 		getfsig	acc1_3 = fp2a_3
    166  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    167  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    168  1.1.1.2  mrg 		br	L(cj4)
    169  1.1.1.2  mrg 
    170  1.1.1.2  mrg L(gt4):		xma.l	fp0b_0 = uy, v0, f0
    171  1.1.1.2  mrg 		xma.hu	fp1a_0 = uy, v0, f0
    172  1.1.1.2  mrg 	;;
    173  1.1.1.2  mrg 		getfsig	acc0 = fp0b_3
    174  1.1.1.2  mrg 		xma.l	fp1b_3 = ux, v1, fp1a_3
    175  1.1.1.2  mrg 		ldf8	u_3 = [up], 8
    176  1.1.1.2  mrg 		xma.hu	fp2a_3 = ux, v1, fp1a_3
    177  1.1.1.2  mrg 	;;
    178  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    179  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    180  1.1.1.2  mrg 	;;
    181  1.1.1.2  mrg 		getfsig	pr0_0 = fp0b_0
    182  1.1.1.2  mrg 		xma.l	fp1b_0 = uy, v1, fp1a_0
    183  1.1.1.2  mrg 		xma.hu	fp2a_0 = uy, v1, fp1a_0
    184  1.1.1.2  mrg 	;;
    185  1.1.1.2  mrg 		ldf8	u_0 = [up], 8
    186  1.1.1.2  mrg 		getfsig	pr1_3 = fp1b_3
    187  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    188  1.1.1.2  mrg 	;;
    189  1.1.1.2  mrg 		getfsig	acc1_3 = fp2a_3
    190  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    191  1.1.1.2  mrg 		br	L(00)
    192      1.1  mrg 
    193      1.1  mrg 
    194      1.1  mrg 	ALIGN(32)
    195  1.1.1.2  mrg L(b01):		ldf8	u_0 = [up], 8		C M
    196  1.1.1.2  mrg 		mov	acc1_1 = 0		C M I
    197  1.1.1.2  mrg 		mov	pr1_1 = 0		C M I
    198  1.1.1.2  mrg 		mov	pr0_2 = 0		C M I
    199  1.1.1.2  mrg 		cmp.ne	p6, p7 = r0, r0		C M I
    200  1.1.1.2  mrg 	;;
    201  1.1.1.2  mrg 		xma.l	fp0b_2 = ux, v0, f0	C F
    202  1.1.1.2  mrg 		cmp.ne	p10, p11 = r0, r0	C M I
    203  1.1.1.2  mrg 		ldf8	u_1 = [up], 8		C M
    204  1.1.1.2  mrg 		xma.hu	fp1a_2 = ux, v0, f0	C F
    205  1.1.1.2  mrg 	;;
    206  1.1.1.2  mrg 		xma.l	fp0b_3 = uy, v0, f0	C F
    207  1.1.1.2  mrg 		xma.hu	fp1a_3 = uy, v0, f0	C F
    208  1.1.1.2  mrg 	;;
    209  1.1.1.2  mrg 		getfsig	acc0 = fp0b_2		C M
    210  1.1.1.2  mrg 		xma.l	fp1b_2 = ux, v1,fp1a_2	C F
    211  1.1.1.2  mrg 		ldf8	u_2 = [up], 8		C M
    212  1.1.1.2  mrg 		xma.hu	fp2a_2 = ux, v1,fp1a_2	C F
    213  1.1.1.2  mrg 		br.cloop.dptk	L(gt5)
    214  1.1.1.2  mrg 
    215  1.1.1.2  mrg 		xma.l	fp0b_0 = u_0, v0, f0	C F
    216  1.1.1.2  mrg 		xma.hu	fp1a_0 = u_0, v0, f0	C F
    217  1.1.1.2  mrg 	;;
    218  1.1.1.2  mrg 		getfsig	pr0_3 = fp0b_3		C M
    219  1.1.1.2  mrg 		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
    220  1.1.1.2  mrg 		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
    221  1.1.1.2  mrg 	;;
    222  1.1.1.2  mrg 		getfsig	pr1_2 = fp1b_2		C M
    223  1.1.1.2  mrg 		getfsig	acc1_2 = fp2a_2		C M
    224  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0	C F
    225  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0	C F
    226  1.1.1.2  mrg 		br	L(cj5)
    227  1.1.1.2  mrg 
    228  1.1.1.2  mrg L(gt5):		xma.l	fp0b_0 = u_0, v0, f0
    229  1.1.1.2  mrg 		xma.hu	fp1a_0 = u_0, v0, f0
    230  1.1.1.2  mrg 	;;
    231  1.1.1.2  mrg 		getfsig	pr0_3 = fp0b_3
    232  1.1.1.2  mrg 		xma.l	fp1b_3 = uy, v1, fp1a_3
    233  1.1.1.2  mrg 		xma.hu	fp2a_3 = uy, v1, fp1a_3
    234  1.1.1.2  mrg 	;;
    235  1.1.1.2  mrg 		ldf8	u_3 = [up], 8
    236  1.1.1.2  mrg 		getfsig	pr1_2 = fp1b_2
    237  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    238  1.1.1.2  mrg 	;;
    239  1.1.1.2  mrg 		getfsig	acc1_2 = fp2a_2
    240  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    241  1.1.1.2  mrg 		br	L(01)
    242      1.1  mrg 
    243      1.1  mrg 
    244      1.1  mrg 	ALIGN(32)
    245  1.1.1.2  mrg L(b10):		br.cloop.dptk	L(gt2)
    246  1.1.1.2  mrg 		xma.l	fp0b_1 = ux, v0, f0
    247  1.1.1.2  mrg 		xma.hu	fp1a_1 = ux, v0, f0
    248  1.1.1.2  mrg 	;;
    249  1.1.1.2  mrg 		xma.l	fp0b_2 = uy, v0, f0
    250  1.1.1.2  mrg 		xma.hu	fp1a_2 = uy, v0, f0
    251  1.1.1.2  mrg 	;;
    252  1.1.1.2  mrg 		stf8	[rp] = fp0b_1, 8
    253  1.1.1.2  mrg 		xma.l	fp1b_1 = ux, v1, fp1a_1
    254  1.1.1.2  mrg 		xma.hu	fp2a_1 = ux, v1, fp1a_1
    255  1.1.1.2  mrg 	;;
    256  1.1.1.2  mrg 		getfsig	acc0 = fp0b_2
    257  1.1.1.2  mrg 		xma.l	fp1b_2 = uy, v1, fp1a_2
    258  1.1.1.2  mrg 		xma.hu	fp2a_2 = uy, v1, fp1a_2
    259  1.1.1.2  mrg 	;;
    260  1.1.1.2  mrg 		getfsig	pr1_1 = fp1b_1
    261  1.1.1.2  mrg 		getfsig	acc1_1 = fp2a_1
    262  1.1.1.2  mrg 		mov	ar.lc = r2
    263  1.1.1.2  mrg 		getfsig	pr1_2 = fp1b_2
    264  1.1.1.2  mrg 		getfsig	r8 = fp2a_2
    265  1.1.1.2  mrg 	;;
    266  1.1.1.2  mrg 		add	s0 = pr1_1, acc0
    267  1.1.1.2  mrg 	;;
    268  1.1.1.2  mrg 		st8	[rp] = s0, 8
    269  1.1.1.2  mrg 		cmp.ltu	p8, p9 = s0, pr1_1
    270  1.1.1.2  mrg 		sub	r31 = -1, acc1_1
    271  1.1.1.2  mrg 	;;
    272  1.1.1.3  mrg 	.pred.rel "mutex", p8, p9
    273  1.1.1.2  mrg 	(p8)	add	acc0 = pr1_2, acc1_1, 1
    274  1.1.1.2  mrg 	(p9)	add	acc0 = pr1_2, acc1_1
    275  1.1.1.2  mrg 	(p8)	cmp.leu	p10, p0 = r31, pr1_2
    276  1.1.1.2  mrg 	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
    277  1.1.1.2  mrg 	;;
    278  1.1.1.2  mrg 		st8	[rp] = acc0, 8
    279  1.1.1.2  mrg 	(p10)	add	r8 = 1, r8
    280  1.1.1.2  mrg 		br.ret.sptk.many b0
    281  1.1.1.2  mrg 
    282  1.1.1.2  mrg L(gt2):		ldf8	u_3 = [up], 8
    283  1.1.1.2  mrg 		mov	acc1_0 = 0
    284  1.1.1.2  mrg 		mov	pr1_0 = 0
    285  1.1.1.2  mrg 	;;
    286  1.1.1.2  mrg 		mov	pr0_1 = 0
    287  1.1.1.2  mrg 		xma.l	fp0b_1 = ux, v0, f0
    288  1.1.1.2  mrg 		ldf8	u_0 = [up], 8
    289  1.1.1.2  mrg 		xma.hu	fp1a_1 = ux, v0, f0
    290  1.1.1.2  mrg 	;;
    291  1.1.1.2  mrg 		xma.l	fp0b_2 = uy, v0, f0
    292  1.1.1.2  mrg 		xma.hu	fp1a_2 = uy, v0, f0
    293  1.1.1.2  mrg 	;;
    294  1.1.1.2  mrg 		getfsig	acc0 = fp0b_1
    295  1.1.1.2  mrg 		xma.l	fp1b_1 = ux, v1, fp1a_1
    296  1.1.1.2  mrg 		xma.hu	fp2a_1 = ux, v1, fp1a_1
    297  1.1.1.2  mrg 	;;
    298  1.1.1.2  mrg 		ldf8	u_1 = [up], 8
    299  1.1.1.2  mrg 		xma.l	fp0b_3 = u_3, v0, f0
    300  1.1.1.2  mrg 		xma.hu	fp1a_3 = u_3, v0, f0
    301  1.1.1.2  mrg 	;;
    302  1.1.1.2  mrg 		getfsig	pr0_2 = fp0b_2
    303  1.1.1.2  mrg 		xma.l	fp1b_2 = uy, v1, fp1a_2
    304  1.1.1.2  mrg 		xma.hu	fp2a_2 = uy, v1, fp1a_2
    305  1.1.1.2  mrg 	;;
    306  1.1.1.2  mrg 		ldf8	u_2 = [up], 8
    307  1.1.1.2  mrg 		getfsig	pr1_1 = fp1b_1
    308  1.1.1.2  mrg 	;;
    309  1.1.1.3  mrg  {.mfi;		getfsig	acc1_1 = fp2a_1
    310  1.1.1.2  mrg 		xma.l	fp0b_0 = u_0, v0, f0
    311  1.1.1.2  mrg 		cmp.ne	p8, p9 = r0, r0
    312  1.1.1.3  mrg }{.mfb;		cmp.ne	p12, p13 = r0, r0
    313  1.1.1.2  mrg 		xma.hu	fp1a_0 = u_0, v0, f0
    314  1.1.1.2  mrg 		br	L(10)
    315  1.1.1.3  mrg }
    316      1.1  mrg 
    317      1.1  mrg 	ALIGN(32)
    318  1.1.1.2  mrg L(b11):		mov	acc1_3 = 0
    319  1.1.1.2  mrg 		mov	pr1_3 = 0
    320  1.1.1.2  mrg 		mov	pr0_0 = 0
    321  1.1.1.2  mrg 		ldf8	u_2 = [up], 8
    322  1.1.1.2  mrg 		cmp.ne	p6, p7 = r0, r0
    323  1.1.1.2  mrg 		br.cloop.dptk	L(gt3)
    324  1.1.1.2  mrg 	;;
    325  1.1.1.2  mrg 		xma.l	fp0b_0 = ux, v0, f0
    326  1.1.1.2  mrg 		xma.hu	fp1a_0 = ux, v0, f0
    327  1.1.1.2  mrg 	;;
    328  1.1.1.2  mrg 		cmp.ne	p10, p11 = r0, r0
    329  1.1.1.2  mrg 		xma.l	fp0b_1 = uy, v0, f0
    330  1.1.1.2  mrg 		xma.hu	fp1a_1 = uy, v0, f0
    331  1.1.1.2  mrg 	;;
    332  1.1.1.2  mrg 		getfsig	acc0 = fp0b_0
    333  1.1.1.2  mrg 		xma.l	fp1b_0 = ux, v1, fp1a_0
    334  1.1.1.2  mrg 		xma.hu	fp2a_0 = ux, v1, fp1a_0
    335  1.1.1.2  mrg 	;;
    336  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    337  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    338  1.1.1.2  mrg 	;;
    339  1.1.1.2  mrg 		getfsig	pr0_1 = fp0b_1
    340  1.1.1.2  mrg 		xma.l	fp1b_1 = uy, v1, fp1a_1
    341  1.1.1.2  mrg 		xma.hu	fp2a_1 = uy, v1, fp1a_1
    342  1.1.1.2  mrg 	;;
    343  1.1.1.2  mrg 		getfsig	pr1_0 = fp1b_0
    344  1.1.1.2  mrg 		getfsig	acc1_0 = fp2a_0
    345  1.1.1.2  mrg 		br	L(cj3)
    346  1.1.1.2  mrg 
    347  1.1.1.2  mrg L(gt3):		xma.l	fp0b_0 = ux, v0, f0
    348  1.1.1.2  mrg 		cmp.ne	p10, p11 = r0, r0
    349  1.1.1.2  mrg 		ldf8	u_3 = [up], 8
    350  1.1.1.2  mrg 		xma.hu	fp1a_0 = ux, v0, f0
    351  1.1.1.2  mrg 	;;
    352  1.1.1.2  mrg 		xma.l	fp0b_1 = uy, v0, f0
    353  1.1.1.2  mrg 		xma.hu	fp1a_1 = uy, v0, f0
    354  1.1.1.2  mrg 	;;
    355  1.1.1.2  mrg 		getfsig	acc0 = fp0b_0
    356  1.1.1.2  mrg 		xma.l	fp1b_0 = ux, v1, fp1a_0
    357  1.1.1.2  mrg 		ldf8	u_0 = [up], 8
    358  1.1.1.2  mrg 		xma.hu	fp2a_0 = ux, v1, fp1a_0
    359  1.1.1.2  mrg 	;;
    360  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    361  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    362  1.1.1.2  mrg 	;;
    363  1.1.1.2  mrg 		getfsig	pr0_1 = fp0b_1
    364  1.1.1.2  mrg 		xma.l	fp1b_1 = uy, v1, fp1a_1
    365  1.1.1.2  mrg 		xma.hu	fp2a_1 = uy, v1, fp1a_1
    366  1.1.1.2  mrg 	;;
    367  1.1.1.2  mrg 		ldf8	u_1 = [up], 8
    368  1.1.1.2  mrg 		getfsig	pr1_0 = fp1b_0
    369  1.1.1.2  mrg 	;;
    370  1.1.1.2  mrg 		getfsig	acc1_0 = fp2a_0
    371  1.1.1.2  mrg 		xma.l	fp0b_3 = u_3, v0, f0
    372  1.1.1.2  mrg 		xma.hu	fp1a_3 = u_3, v0, f0
    373  1.1.1.2  mrg 		br	L(11)
    374      1.1  mrg 
    375      1.1  mrg 
    376      1.1  mrg C *** MAIN LOOP START ***
    377      1.1  mrg 	ALIGN(32)
    378  1.1.1.2  mrg L(top):						C 00
    379  1.1.1.3  mrg 	.pred.rel "mutex", p8, p9
    380  1.1.1.3  mrg 	.pred.rel "mutex", p12, p13
    381  1.1.1.2  mrg 		ldf8	u_3 = [up], 8
    382  1.1.1.2  mrg 		getfsig	pr1_2 = fp1b_2
    383  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    384  1.1.1.2  mrg 	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    385  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    386  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    387      1.1  mrg 	;;					C 01
    388  1.1.1.3  mrg 	.pred.rel "mutex", p6, p7
    389  1.1.1.2  mrg 		getfsig	acc1_2 = fp2a_2
    390  1.1.1.2  mrg 		st8	[rp] = s0, 8
    391  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    392  1.1.1.2  mrg 	(p6)	add	acc0 = pr0_2, acc1_0, 1
    393  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_2, acc1_0
    394  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    395      1.1  mrg 	;;					C 02
    396  1.1.1.2  mrg L(01):
    397  1.1.1.3  mrg 	.pred.rel "mutex", p10, p11
    398  1.1.1.2  mrg 		getfsig	pr0_0 = fp0b_0
    399  1.1.1.2  mrg 		xma.l	fp1b_0 = u_0, v1, fp1a_0
    400  1.1.1.2  mrg 	(p10)	add	s0 = pr1_1, acc0, 1
    401  1.1.1.2  mrg 	(p11)	add	s0 = pr1_1, acc0
    402  1.1.1.2  mrg 		xma.hu	fp2a_0 = u_0, v1, fp1a_0
    403  1.1.1.2  mrg 		nop	1
    404      1.1  mrg 	;;					C 03
    405  1.1.1.3  mrg 	.pred.rel "mutex", p6, p7
    406  1.1.1.3  mrg 	.pred.rel "mutex", p10, p11
    407  1.1.1.2  mrg 		ldf8	u_0 = [up], 8
    408  1.1.1.2  mrg 		getfsig	pr1_3 = fp1b_3
    409  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    410  1.1.1.2  mrg 	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    411  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    412  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    413      1.1  mrg 	;;					C 04
    414  1.1.1.3  mrg 	.pred.rel "mutex", p8, p9
    415  1.1.1.2  mrg 		getfsig	acc1_3 = fp2a_3
    416  1.1.1.2  mrg 		st8	[rp] = s0, 8
    417  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    418  1.1.1.2  mrg 	(p8)	add	acc0 = pr0_3, acc1_1, 1
    419  1.1.1.2  mrg 	(p9)	add	acc0 = pr0_3, acc1_1
    420  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    421      1.1  mrg 	;;					C 05
    422  1.1.1.2  mrg L(00):
    423  1.1.1.3  mrg 	.pred.rel "mutex", p12, p13
    424  1.1.1.2  mrg 		getfsig	pr0_1 = fp0b_1
    425  1.1.1.2  mrg 		xma.l	fp1b_1 = u_1, v1, fp1a_1
    426  1.1.1.2  mrg 	(p12)	add	s0 = pr1_2, acc0, 1
    427  1.1.1.2  mrg 	(p13)	add	s0 = pr1_2, acc0
    428  1.1.1.2  mrg 		xma.hu	fp2a_1 = u_1, v1, fp1a_1
    429  1.1.1.2  mrg 		nop	1
    430      1.1  mrg 	;;					C 06
    431  1.1.1.3  mrg 	.pred.rel "mutex", p8, p9
    432  1.1.1.3  mrg 	.pred.rel "mutex", p12, p13
    433  1.1.1.2  mrg 		ldf8	u_1 = [up], 8
    434  1.1.1.2  mrg 		getfsig	pr1_0 = fp1b_0
    435  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
    436  1.1.1.2  mrg 	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
    437  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_2
    438  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
    439      1.1  mrg 	;;					C 07
    440  1.1.1.3  mrg 	.pred.rel "mutex", p6, p7
    441  1.1.1.2  mrg 		getfsig	acc1_0 = fp2a_0
    442  1.1.1.2  mrg 		st8	[rp] = s0, 8
    443  1.1.1.2  mrg 		xma.l	fp0b_3 = u_3, v0, f0
    444  1.1.1.2  mrg 	(p6)	add	acc0 = pr0_0, acc1_2, 1
    445  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_0, acc1_2
    446  1.1.1.2  mrg 		xma.hu	fp1a_3 = u_3, v0, f0
    447      1.1  mrg 	;;					C 08
    448  1.1.1.2  mrg L(11):
    449  1.1.1.3  mrg 	.pred.rel "mutex", p10, p11
    450  1.1.1.2  mrg 		getfsig	pr0_2 = fp0b_2
    451  1.1.1.2  mrg 		xma.l	fp1b_2 = u_2, v1, fp1a_2
    452  1.1.1.2  mrg 	(p10)	add	s0 = pr1_3, acc0, 1
    453  1.1.1.2  mrg 	(p11)	add	s0 = pr1_3, acc0
    454  1.1.1.2  mrg 		xma.hu	fp2a_2 = u_2, v1, fp1a_2
    455  1.1.1.2  mrg 		nop	1
    456      1.1  mrg 	;;					C 09
    457  1.1.1.3  mrg 	.pred.rel "mutex", p6, p7
    458  1.1.1.3  mrg 	.pred.rel "mutex", p10, p11
    459  1.1.1.2  mrg 		ldf8	u_2 = [up], 8
    460  1.1.1.2  mrg 		getfsig	pr1_1 = fp1b_1
    461  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
    462  1.1.1.2  mrg 	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
    463  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_3
    464  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
    465      1.1  mrg 	;;					C 10
    466  1.1.1.3  mrg 	.pred.rel "mutex", p8, p9
    467  1.1.1.2  mrg 		getfsig	acc1_1 = fp2a_1
    468  1.1.1.2  mrg 		st8	[rp] = s0, 8
    469  1.1.1.2  mrg 		xma.l	fp0b_0 = u_0, v0, f0
    470  1.1.1.2  mrg 	(p8)	add	acc0 = pr0_1, acc1_3, 1
    471  1.1.1.2  mrg 	(p9)	add	acc0 = pr0_1, acc1_3
    472  1.1.1.2  mrg 		xma.hu	fp1a_0 = u_0, v0, f0
    473      1.1  mrg 	;;					C 11
    474  1.1.1.2  mrg L(10):
    475  1.1.1.3  mrg 	.pred.rel "mutex", p12, p13
    476  1.1.1.2  mrg 		getfsig	pr0_3 = fp0b_3
    477  1.1.1.2  mrg 		xma.l	fp1b_3 = u_3, v1, fp1a_3
    478  1.1.1.2  mrg 	(p12)	add	s0 = pr1_0, acc0, 1
    479  1.1.1.2  mrg 	(p13)	add	s0 = pr1_0, acc0
    480  1.1.1.2  mrg 		xma.hu	fp2a_3 = u_3, v1, fp1a_3
    481  1.1.1.2  mrg 		br.cloop.dptk	L(top)
    482      1.1  mrg 	;;
    483      1.1  mrg C *** MAIN LOOP END ***
    484      1.1  mrg 
    485  1.1.1.3  mrg 	.pred.rel "mutex", p8, p9
    486  1.1.1.3  mrg 	.pred.rel "mutex", p12, p13
    487  1.1.1.3  mrg  {.mmi;		getfsig	pr1_2 = fp1b_2
    488  1.1.1.2  mrg 		st8	[rp] = s0, 8
    489  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    490  1.1.1.3  mrg }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    491  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    492  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    493  1.1.1.2  mrg 	;;
    494  1.1.1.3  mrg }	.pred.rel "mutex", p6, p7
    495  1.1.1.3  mrg  {.mfi;		getfsig	acc1_2 = fp2a_2
    496  1.1.1.2  mrg 		xma.l	fp0b_1 = u_1, v0, f0
    497  1.1.1.2  mrg 		nop	1
    498  1.1.1.3  mrg }{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
    499  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_2, acc1_0
    500  1.1.1.2  mrg 		xma.hu	fp1a_1 = u_1, v0, f0
    501  1.1.1.2  mrg 	;;
    502  1.1.1.3  mrg }
    503  1.1.1.2  mrg L(cj5):
    504  1.1.1.3  mrg 	.pred.rel "mutex", p10, p11
    505  1.1.1.3  mrg  {.mfi;		getfsig	pr0_0 = fp0b_0
    506  1.1.1.2  mrg 		xma.l	fp1b_0 = u_0, v1, fp1a_0
    507  1.1.1.2  mrg 	(p10)	add	s0 = pr1_1, acc0, 1
    508  1.1.1.3  mrg }{.mfi;	(p11)	add	s0 = pr1_1, acc0
    509  1.1.1.2  mrg 		xma.hu	fp2a_0 = u_0, v1, fp1a_0
    510  1.1.1.2  mrg 		nop	1
    511  1.1.1.2  mrg 	;;
    512  1.1.1.3  mrg }	.pred.rel "mutex", p6, p7
    513  1.1.1.3  mrg 	.pred.rel "mutex", p10, p11
    514  1.1.1.3  mrg  {.mmi;		getfsig	pr1_3 = fp1b_3
    515  1.1.1.2  mrg 		st8	[rp] = s0, 8
    516  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    517  1.1.1.3  mrg }{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    518  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    519  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    520  1.1.1.2  mrg 	;;
    521  1.1.1.3  mrg }	.pred.rel "mutex", p8, p9
    522  1.1.1.3  mrg  {.mfi;		getfsig	acc1_3 = fp2a_3
    523  1.1.1.2  mrg 		xma.l	fp0b_2 = u_2, v0, f0
    524  1.1.1.2  mrg 		nop	1
    525  1.1.1.3  mrg }{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
    526  1.1.1.2  mrg 	(p9)	add	acc0 = pr0_3, acc1_1
    527  1.1.1.2  mrg 		xma.hu	fp1a_2 = u_2, v0, f0
    528  1.1.1.2  mrg 	;;
    529  1.1.1.3  mrg }
    530  1.1.1.2  mrg L(cj4):
    531  1.1.1.3  mrg 	.pred.rel "mutex", p12, p13
    532  1.1.1.3  mrg  {.mfi;		getfsig	pr0_1 = fp0b_1
    533  1.1.1.2  mrg 		xma.l	fp1b_1 = u_1, v1, fp1a_1
    534  1.1.1.2  mrg 	(p12)	add	s0 = pr1_2, acc0, 1
    535  1.1.1.3  mrg }{.mfi;	(p13)	add	s0 = pr1_2, acc0
    536  1.1.1.2  mrg 		xma.hu	fp2a_1 = u_1, v1, fp1a_1
    537  1.1.1.2  mrg 		nop	1
    538  1.1.1.2  mrg 	;;
    539  1.1.1.3  mrg }	.pred.rel "mutex", p8, p9
    540  1.1.1.3  mrg 	.pred.rel "mutex", p12, p13
    541  1.1.1.3  mrg  {.mmi;		getfsig	pr1_0 = fp1b_0
    542  1.1.1.2  mrg 		st8	[rp] = s0, 8
    543  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
    544  1.1.1.3  mrg }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
    545  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_2
    546  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
    547  1.1.1.2  mrg 	;;
    548  1.1.1.3  mrg }	.pred.rel "mutex", p6, p7
    549  1.1.1.3  mrg  {.mmi;		getfsig	acc1_0 = fp2a_0
    550  1.1.1.2  mrg 	(p6)	add	acc0 = pr0_0, acc1_2, 1
    551  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_0, acc1_2
    552  1.1.1.2  mrg 	;;
    553  1.1.1.3  mrg }
    554  1.1.1.2  mrg L(cj3):
    555  1.1.1.3  mrg 	.pred.rel "mutex", p10, p11
    556  1.1.1.3  mrg  {.mfi;		getfsig	pr0_2 = fp0b_2
    557  1.1.1.2  mrg 		xma.l	fp1b_2 = u_2, v1, fp1a_2
    558  1.1.1.2  mrg 	(p10)	add	s0 = pr1_3, acc0, 1
    559  1.1.1.3  mrg }{.mfi;	(p11)	add	s0 = pr1_3, acc0
    560  1.1.1.2  mrg 		xma.hu	fp2a_2 = u_2, v1, fp1a_2
    561  1.1.1.2  mrg 		nop	1
    562  1.1.1.2  mrg 	;;
    563  1.1.1.3  mrg }	.pred.rel "mutex", p6, p7
    564  1.1.1.3  mrg 	.pred.rel "mutex", p10, p11
    565  1.1.1.3  mrg  {.mmi;		getfsig	pr1_1 = fp1b_1
    566  1.1.1.2  mrg 		st8	[rp] = s0, 8
    567  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
    568  1.1.1.3  mrg }{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
    569  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_3
    570  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
    571  1.1.1.2  mrg 	;;
    572  1.1.1.3  mrg }	.pred.rel "mutex", p8, p9
    573  1.1.1.3  mrg  {.mmi;		getfsig	acc1_1 = fp2a_1
    574  1.1.1.2  mrg 	(p8)	add	acc0 = pr0_1, acc1_3, 1
    575  1.1.1.2  mrg 	(p9)	add	acc0 = pr0_1, acc1_3
    576  1.1.1.2  mrg 	;;
    577  1.1.1.3  mrg }	.pred.rel "mutex", p12, p13
    578  1.1.1.3  mrg  {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
    579  1.1.1.2  mrg 	(p13)	add	s0 = pr1_0, acc0
    580  1.1.1.2  mrg 		nop	1
    581  1.1.1.2  mrg 	;;
    582  1.1.1.3  mrg }	.pred.rel "mutex", p8, p9
    583  1.1.1.3  mrg 	.pred.rel "mutex", p12, p13
    584  1.1.1.3  mrg  {.mmi;		getfsig	pr1_2 = fp1b_2
    585  1.1.1.2  mrg 		st8	[rp] = s0, 8
    586  1.1.1.2  mrg 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    587  1.1.1.3  mrg }{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    588  1.1.1.2  mrg 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    589  1.1.1.2  mrg 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    590  1.1.1.2  mrg 	;;
    591  1.1.1.3  mrg }	.pred.rel "mutex", p6, p7
    592  1.1.1.3  mrg  {.mmi;		getfsig	r8 = fp2a_2
    593  1.1.1.2  mrg 	(p6)	add	acc0 = pr0_2, acc1_0, 1
    594  1.1.1.2  mrg 	(p7)	add	acc0 = pr0_2, acc1_0
    595  1.1.1.2  mrg 	;;
    596  1.1.1.3  mrg }	.pred.rel "mutex", p10, p11
    597  1.1.1.3  mrg  {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
    598  1.1.1.2  mrg 	(p11)	add	s0 = pr1_1, acc0
    599  1.1.1.2  mrg 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    600  1.1.1.2  mrg 	;;
    601  1.1.1.3  mrg }	.pred.rel "mutex", p10, p11
    602  1.1.1.3  mrg  {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    603  1.1.1.2  mrg 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    604  1.1.1.2  mrg 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    605  1.1.1.2  mrg 	;;
    606  1.1.1.3  mrg }	.pred.rel "mutex", p8, p9
    607  1.1.1.3  mrg  {.mmi;		st8	[rp] = s0, 8
    608  1.1.1.2  mrg 	(p8)	add	acc0 = pr1_2, acc1_1, 1
    609  1.1.1.2  mrg 	(p9)	add	acc0 = pr1_2, acc1_1
    610  1.1.1.2  mrg 	;;
    611  1.1.1.3  mrg }	.pred.rel "mutex", p8, p9
    612  1.1.1.3  mrg  {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
    613  1.1.1.2  mrg 	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
    614  1.1.1.2  mrg 	(p12)	add	acc0 = 1, acc0
    615  1.1.1.2  mrg 	;;
    616  1.1.1.3  mrg }{.mmi;		st8	[rp] = acc0, 8
    617  1.1.1.2  mrg 	(p12)	cmpeqor	p10, p0 = 0, acc0
    618  1.1.1.2  mrg 		nop	1
    619  1.1.1.2  mrg 	;;
    620  1.1.1.3  mrg }{.mib;	(p10)	add	r8 = 1, r8
    621  1.1.1.2  mrg 		mov	ar.lc = r2
    622  1.1.1.2  mrg 		br.ret.sptk.many b0
    623  1.1.1.3  mrg }
    624      1.1  mrg EPILOGUE()
    625      1.1  mrg ASM_END()
    626