Home | History | Annotate | Line # | Download | only in ia64
mul_2.asm revision 1.1
      1  1.1  mrg dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number by a 2-limb number, store the
      2  1.1  mrg dnl  low n+1 limbs of the product, and return the most significant limb.
      3  1.1  mrg 
      4  1.1  mrg dnl  Copyright 2004 Free Software Foundation, Inc.
      5  1.1  mrg 
      6  1.1  mrg dnl  This file is part of the GNU MP Library.
      7  1.1  mrg 
      8  1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9  1.1  mrg dnl  it under the terms of the GNU Lesser General Public License as published
     10  1.1  mrg dnl  by the Free Software Foundation; either version 3 of the License, or (at
     11  1.1  mrg dnl  your option) any later version.
     12  1.1  mrg 
     13  1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     14  1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     15  1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     16  1.1  mrg dnl  License for more details.
     17  1.1  mrg 
     18  1.1  mrg dnl  You should have received a copy of the GNU Lesser General Public License
     19  1.1  mrg dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     20  1.1  mrg 
     21  1.1  mrg include(`../config.m4')
     22  1.1  mrg 
     23  1.1  mrg C         cycles/limb
     24  1.1  mrg C Itanium:    3.15
     25  1.1  mrg C Itanium 2:  1.625
     26  1.1  mrg 
     27  1.1  mrg C Note that this is very similar to addmul_2.asm.  If you change this file,
     28  1.1  mrg C please change that file too.
     29  1.1  mrg 
     30  1.1  mrg C TODO
     31  1.1  mrg C  * Clean up variable names, and try to decrease the number of distinct
     32  1.1  mrg C    registers used.
     33  1.1  mrg C  * Cleanup feed-in code to not require zeroing several registers.
     34  1.1  mrg C  * Make sure we don't depend on uninitialized predicate registers.
     35  1.1  mrg C  * We currently cross-jump very aggressively, at the expense of a few cycles
     36  1.1  mrg C    per operation.  Consider changing that.
     37  1.1  mrg C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
     38  1.1  mrg C    wind-down code.
     39  1.1  mrg C  * Ultimately rewrite.  The problem with this code is that it first uses a
     40  1.1  mrg C    loaded u value in one xma pair, then leaves it live over several unrelated
     41  1.1  mrg C    xma pairs, before it uses it again.  It should actually be quite possible
     42  1.1  mrg C    to just swap some aligned xma pairs around.  But we should then schedule
     43  1.1  mrg C    u loads further from the first use.
     44  1.1  mrg 
     45  1.1  mrg C INPUT PARAMETERS: rp = destination, up = n-limb source, n = limb count of up, vp = 2-limb multiplier
     46  1.1  mrg define(`rp',`r32')
     47  1.1  mrg define(`up',`r33')
     48  1.1  mrg define(`n',`r34')
     49  1.1  mrg define(`vp',`r35')
     50  1.1  mrg C srp is defined but not referenced by the code in this file.
     51  1.1  mrg define(`srp',`r3')
     52  1.1  mrg C v0, v1: the two multiplier limbs, loaded from vp.
     53  1.1  mrg define(`v0',`f6')
     54  1.1  mrg define(`v1',`f7')
     55  1.1  mrg C s0: finished result limb about to be stored; acc0: partial sum feeding s0.  r14 also holds n mod 4 during dispatch.
     56  1.1  mrg define(`s0',`r14')
     57  1.1  mrg define(`acc0',`r15')
     58  1.1  mrg C pr0_k: integer copy of fp0b_k, the low limb of u_k*v0.
     59  1.1  mrg define(`pr0_0',`r16') define(`pr0_1',`r17')
     60  1.1  mrg define(`pr0_2',`r18') define(`pr0_3',`r19')
     61  1.1  mrg C pr1_k: integer copy of fp1b_k, the low limb of u_k*v1 + fp1a_k.
     62  1.1  mrg define(`pr1_0',`r20') define(`pr1_1',`r21')
     63  1.1  mrg define(`pr1_2',`r22') define(`pr1_3',`r23')
     64  1.1  mrg C acc1_k: integer copy of fp2a_k, the high limb of u_k*v1 + fp1a_k.
     65  1.1  mrg define(`acc1_0',`r24') define(`acc1_1',`r25')
     66  1.1  mrg define(`acc1_2',`r26') define(`acc1_3',`r27')
     67  1.1  mrg C r28-r30 have no alias and are unused; r31 is used under its raw name as scratch in the n = 2 code.
     68  1.1  mrg dnl define(`',`r28')
     69  1.1  mrg dnl define(`',`r29')
     70  1.1  mrg dnl define(`',`r30')
     71  1.1  mrg dnl define(`',`r31')
     72  1.1  mrg C fp0b_k: low limb of u_k*v0 (xma.l).
     73  1.1  mrg define(`fp0b_0',`f8') define(`fp0b_1',`f9')
     74  1.1  mrg define(`fp0b_2',`f10') define(`fp0b_3',`f11')
     75  1.1  mrg C fp1a_k: high limb of u_k*v0 (xma.hu).
     76  1.1  mrg define(`fp1a_0',`f12') define(`fp1a_1',`f13')
     77  1.1  mrg define(`fp1a_2',`f14') define(`fp1a_3',`f15')
     78  1.1  mrg C fp1b_k: low limb of u_k*v1 + fp1a_k (xma.l).
     79  1.1  mrg define(`fp1b_0',`f32') define(`fp1b_1',`f33')
     80  1.1  mrg define(`fp1b_2',`f34') define(`fp1b_3',`f35')
     81  1.1  mrg C fp2a_k: high limb of u_k*v1 + fp1a_k (xma.hu).
     82  1.1  mrg define(`fp2a_0',`f36') define(`fp2a_1',`f37')
     83  1.1  mrg define(`fp2a_2',`f38') define(`fp2a_3',`f39')
     84  1.1  mrg C u_0..u_3: the u limbs currently in flight, loaded from up.
     85  1.1  mrg define(`u_0',`f44') define(`u_1',`f45')
     86  1.1  mrg define(`u_2',`f46') define(`u_3',`f47')
     87  1.1  mrg C ux, uy: the first two u limbs, loaded before the dispatch on n mod 4.
     88  1.1  mrg define(`ux',`f49')
     89  1.1  mrg define(`uy',`f51')
     90  1.1  mrg 
     91  1.1  mrg ASM_START()
     92  1.1  mrg PROLOGUE(mpn_mul_2)
     93  1.1  mrg 	.prologue
     94  1.1  mrg 	.save	ar.lc, r2
     95  1.1  mrg 	.body
     96  1.1  mrg C With the 32-bit ABI, widen the three pointers with addp4 and zero-extend n.
     97  1.1  mrg ifdef(`HAVE_ABI_32',
     98  1.1  mrg `	addp4		rp = 0, rp		C			M I
     99  1.1  mrg 	addp4		up = 0, up		C			M I
    100  1.1  mrg 	addp4		vp = 0, vp		C			M I
    101  1.1  mrg 	zxt4		n = n			C			I
    102  1.1  mrg 	;;')
    103  1.1  mrg C Load ux, uy, v0, v1; save ar.lc in r2; set ar.lc = (n-2)>>2; branch on n mod 4 (0 falls through to .Lb00).
    104  1.1  mrg {.mmi		C 00
    105  1.1  mrg 	ldf8		ux = [up], 8		C			M
    106  1.1  mrg 	ldf8		v0 = [vp], 8		C			M
    107  1.1  mrg 	mov.i		r2 = ar.lc		C			I0
    108  1.1  mrg }{.mmi
    109  1.1  mrg 	nop		0			C			M
    110  1.1  mrg 	and		r14 = 3, n		C			M I
    111  1.1  mrg 	add		n = -2, n		C			M I
    112  1.1  mrg 	;;
    113  1.1  mrg }{.mmi		C 01
    114  1.1  mrg 	ldf8		uy = [up], 8		C			M
    115  1.1  mrg 	ldf8		v1 = [vp]		C			M
    116  1.1  mrg 	shr.u		n = n, 2		C			I
    117  1.1  mrg }{.mmi
    118  1.1  mrg 	nop		0			C			M
    119  1.1  mrg 	cmp.eq		p10, p0 = 1, r14	C			M I
    120  1.1  mrg 	cmp.eq		p11, p0 = 2, r14	C			M I
    121  1.1  mrg 	;;
    122  1.1  mrg }{.mmi		C 02
    123  1.1  mrg 	nop		0			C			M
    124  1.1  mrg 	cmp.eq		p12, p0 = 3, r14	C			M I
    125  1.1  mrg 	mov.i		ar.lc = n		C			I0
    126  1.1  mrg }{.bbb
    127  1.1  mrg   (p10) br.dptk		.Lb01			C			B
    128  1.1  mrg   (p11) br.dptk		.Lb10			C			B
    129  1.1  mrg   (p12) br.dptk		.Lb11			C			B
    130  1.1  mrg 	;;
    131  1.1  mrg }
    132  1.1  mrg C Feed-in for n mod 4 = 0 (fall-through from the dispatch).  ux uses slot 3, uy slot 0.
    133  1.1  mrg 	ALIGN(32)
    134  1.1  mrg .Lb00:	ldf8		u_1 = [up], 8
    135  1.1  mrg 	mov		acc1_2 = 0		C these three are read at .LL00/.Lcj4
    136  1.1  mrg 	mov		pr1_2 = 0		C before the pipeline has written them,
    137  1.1  mrg 	mov		pr0_3 = 0		C so they start at zero
    138  1.1  mrg 	cmp.ne		p8, p9 = r0, r0		C p8 = 0, p9 = 1: no carry-in
    139  1.1  mrg 	;;
    140  1.1  mrg 	xma.l		fp0b_3 = ux, v0, f0
    141  1.1  mrg 	cmp.ne		p12, p13 = r0, r0	C p12 = 0, p13 = 1: no carry-in
    142  1.1  mrg 	ldf8		u_2 = [up], 8
    143  1.1  mrg 	xma.hu		fp1a_3 = ux, v0, f0
    144  1.1  mrg 	br.cloop.dptk	.grt4			C taken if ar.lc != 0 (n > 4)
    145  1.1  mrg C n = 4: all four u limbs are loaded; finish in the wind-down code at .Lcj4.
    146  1.1  mrg 	xma.l		fp0b_0 = uy, v0, f0
    147  1.1  mrg 	xma.hu		fp1a_0 = uy, v0, f0
    148  1.1  mrg 	;;
    149  1.1  mrg 	getf.sig	acc0 = fp0b_3
    150  1.1  mrg 	xma.l		fp1b_3 = ux, v1, fp1a_3
    151  1.1  mrg 	xma.hu		fp2a_3 = ux, v1, fp1a_3
    152  1.1  mrg 	;;
    153  1.1  mrg 	xma.l		fp0b_1 = u_1, v0, f0
    154  1.1  mrg 	xma.hu		fp1a_1 = u_1, v0, f0
    155  1.1  mrg 	;;
    156  1.1  mrg 	getf.sig	pr0_0 = fp0b_0
    157  1.1  mrg 	xma.l		fp1b_0 = uy, v1, fp1a_0
    158  1.1  mrg 	xma.hu		fp2a_0 = uy, v1, fp1a_0
    159  1.1  mrg 	;;
    160  1.1  mrg 	getf.sig	pr1_3 = fp1b_3
    161  1.1  mrg 	getf.sig	acc1_3 = fp2a_3
    162  1.1  mrg 	xma.l		fp0b_2 = u_2, v0, f0
    163  1.1  mrg 	xma.hu		fp1a_2 = u_2, v0, f0
    164  1.1  mrg 	br		.Lcj4
    165  1.1  mrg C n > 4: same products as above, interleaved with further u loads; join the main loop at .LL00.
    166  1.1  mrg .grt4:	xma.l		fp0b_0 = uy, v0, f0
    167  1.1  mrg 	xma.hu		fp1a_0 = uy, v0, f0
    168  1.1  mrg 	;;
    169  1.1  mrg 	getf.sig	acc0 = fp0b_3
    170  1.1  mrg 	xma.l		fp1b_3 = ux, v1, fp1a_3
    171  1.1  mrg 	ldf8		u_3 = [up], 8
    172  1.1  mrg 	xma.hu		fp2a_3 = ux, v1, fp1a_3
    173  1.1  mrg 	;;
    174  1.1  mrg 	xma.l		fp0b_1 = u_1, v0, f0
    175  1.1  mrg 	xma.hu		fp1a_1 = u_1, v0, f0
    176  1.1  mrg 	;;
    177  1.1  mrg 	getf.sig	pr0_0 = fp0b_0
    178  1.1  mrg 	xma.l		fp1b_0 = uy, v1, fp1a_0
    179  1.1  mrg 	xma.hu		fp2a_0 = uy, v1, fp1a_0
    180  1.1  mrg 	;;
    181  1.1  mrg 	ldf8		u_0 = [up], 8
    182  1.1  mrg 	getf.sig	pr1_3 = fp1b_3
    183  1.1  mrg 	;;
    184  1.1  mrg 	getf.sig	acc1_3 = fp2a_3
    185  1.1  mrg 	xma.l		fp0b_2 = u_2, v0, f0
    186  1.1  mrg 	xma.hu		fp1a_2 = u_2, v0, f0
    187  1.1  mrg 	br		.LL00
    188  1.1  mrg 
    189  1.1  mrg C Feed-in for n mod 4 = 1.  ux uses slot 2, uy slot 3; acc1_1, pr1_1, pr0_2 and the carry-ins start at zero.
    190  1.1  mrg 	ALIGN(32)
    191  1.1  mrg .Lb01:	ldf8		u_0 = [up], 8		C M
    192  1.1  mrg 	mov		acc1_1 = 0		C M I
    193  1.1  mrg 	mov		pr1_1 = 0		C M I
    194  1.1  mrg 	mov		pr0_2 = 0		C M I
    195  1.1  mrg 	cmp.ne		p6, p7 = r0, r0		C M I
    196  1.1  mrg 	;;
    197  1.1  mrg 	xma.l		fp0b_2 = ux, v0, f0	C F
    198  1.1  mrg 	cmp.ne		p10, p11 = r0, r0	C M I
    199  1.1  mrg 	ldf8		u_1 = [up], 8		C M
    200  1.1  mrg 	xma.hu		fp1a_2 = ux, v0, f0	C F
    201  1.1  mrg 	;;
    202  1.1  mrg 	xma.l		fp0b_3 = uy, v0, f0	C F
    203  1.1  mrg 	xma.hu		fp1a_3 = uy, v0, f0	C F
    204  1.1  mrg 	;;
    205  1.1  mrg 	getf.sig	acc0 = fp0b_2		C M
    206  1.1  mrg 	xma.l		fp1b_2 = ux, v1,fp1a_2	C F
    207  1.1  mrg 	xma.hu		fp2a_2 = ux, v1,fp1a_2	C F
    208  1.1  mrg 	ldf8		u_2 = [up], 8		C M
    209  1.1  mrg 	br.cloop.dptk	.grt5			C taken if ar.lc != 0 (n > 5)
    210  1.1  mrg C n = 5: all five u limbs are loaded; finish in the wind-down code at .Lcj5.
    211  1.1  mrg 	xma.l		fp0b_0 = u_0, v0, f0	C F
    212  1.1  mrg 	xma.hu		fp1a_0 = u_0, v0, f0	C F
    213  1.1  mrg 	;;
    214  1.1  mrg 	getf.sig	pr0_3 = fp0b_3		C M
    215  1.1  mrg 	xma.l		fp1b_3 = uy, v1,fp1a_3	C F
    216  1.1  mrg 	xma.hu		fp2a_3 = uy, v1,fp1a_3	C F
    217  1.1  mrg 	;;
    218  1.1  mrg 	getf.sig	pr1_2 = fp1b_2		C M
    219  1.1  mrg 	getf.sig	acc1_2 = fp2a_2		C M
    220  1.1  mrg 	xma.l		fp0b_1 = u_1, v0, f0	C F
    221  1.1  mrg 	xma.hu		fp1a_1 = u_1, v0, f0	C F
    222  1.1  mrg 	br		.Lcj5
    223  1.1  mrg C n > 5: same products as above plus one more u load; join the main loop at .LL01.
    224  1.1  mrg .grt5:	xma.l		fp0b_0 = u_0, v0, f0
    225  1.1  mrg 	xma.hu		fp1a_0 = u_0, v0, f0
    226  1.1  mrg 	;;
    227  1.1  mrg 	getf.sig	pr0_3 = fp0b_3
    228  1.1  mrg 	xma.l		fp1b_3 = uy, v1, fp1a_3
    229  1.1  mrg 	xma.hu		fp2a_3 = uy, v1, fp1a_3
    230  1.1  mrg 	;;
    231  1.1  mrg 	ldf8		u_3 = [up], 8
    232  1.1  mrg 	getf.sig	pr1_2 = fp1b_2
    233  1.1  mrg 	;;
    234  1.1  mrg 	getf.sig	acc1_2 = fp2a_2
    235  1.1  mrg 	xma.l		fp0b_1 = u_1, v0, f0
    236  1.1  mrg 	xma.hu		fp1a_1 = u_1, v0, f0
    237  1.1  mrg 	br		.LL01
    238  1.1  mrg 
    239  1.1  mrg 
    240  1.1  mrg C We have two variants for n = 2.  They turn out to run at exactly the same
    241  1.1  mrg C speed.  But the first, odd variant might allow one cycle to be trimmed.
    242  1.1  mrg 	ALIGN(32)
    243  1.1  mrg ifdef(`',`
    244  1.1  mrg .Lb10:		C 03
    245  1.1  mrg 	br.cloop.dptk	.grt2
    246  1.1  mrg 		C 04
    247  1.1  mrg 		C 05
    248  1.1  mrg 		C 06
    249  1.1  mrg 	xma.l		fp0b_1 = ux, v0, f0	C 0
    250  1.1  mrg 	xma.hu		fp1a_1 = ux, v0, f0	C 1
    251  1.1  mrg 	;;	C 07
    252  1.1  mrg 	xma.l		fp0b_2 = uy, v0, f0	C 1
    253  1.1  mrg 	xma.l		fp1b_1 = ux, v1, f0	C 1
    254  1.1  mrg 	;;	C 08
    255  1.1  mrg 	xma.hu		fp1a_2 = uy, v0, f0	C 2
    256  1.1  mrg 	xma.hu		fp2a_1 = ux, v1, f0	C 2
    257  1.1  mrg 	;;	C 09
    258  1.1  mrg 	xma.l		fp1b_2 = uy, v1, f0	C 2
    259  1.1  mrg 	xma.hu		fp2a_2 = uy, v1, f0	C 3
    260  1.1  mrg 	;;	C 10
    261  1.1  mrg 	getf.sig	r16 = fp1a_1
    262  1.1  mrg 	stf8		[rp] = fp0b_1, 8
    263  1.1  mrg 	;;	C 11
    264  1.1  mrg 	getf.sig	r17 = fp0b_2
    265  1.1  mrg 		C 12
    266  1.1  mrg 	getf.sig	r18 = fp1b_1
    267  1.1  mrg 		C 13
    268  1.1  mrg 	getf.sig	r19 = fp1a_2
    269  1.1  mrg 		C 14
    270  1.1  mrg 	getf.sig	r20 = fp2a_1
    271  1.1  mrg 		C 15
    272  1.1  mrg 	getf.sig	r21 = fp1b_2
    273  1.1  mrg 	;;	C 16
    274  1.1  mrg 	getf.sig	r8 = fp2a_2
    275  1.1  mrg 	add		r24 = r16, r17
    276  1.1  mrg 	;;	C 17
    277  1.1  mrg 	cmp.ltu		p6, p7 = r24, r16
    278  1.1  mrg 	add		r26 = r24, r18
    279  1.1  mrg 	;;	C 18
    280  1.1  mrg 	cmp.ltu		p8, p9 = r26, r24
    281  1.1  mrg 	;;	C 19
    282  1.1  mrg 	st8		[rp] = r26, 8
    283  1.1  mrg   (p6)	add		r25 = r19, r20, 1
    284  1.1  mrg   (p7)	add		r25 = r19, r20
    285  1.1  mrg 	;;	C 20
    286  1.1  mrg   (p8)	add		r27 = r25, r21, 1
    287  1.1  mrg   (p9)	add		r27 = r25, r21
    288  1.1  mrg   (p6)	cmp.leu		p10, p0 = r25, r19
    289  1.1  mrg   (p7)	cmp.ltu		p10, p0 = r25, r19
    290  1.1  mrg 	;;	C 21
    291  1.1  mrg   (p10)	add		r8 = 1, r8
    292  1.1  mrg   (p8)	cmp.leu		p12, p0 = r27, r25
    293  1.1  mrg   (p9)	cmp.ltu		p12, p0 = r27, r25
    294  1.1  mrg 	;;	C 22
    295  1.1  mrg 	st8		[rp] = r27, 8
    296  1.1  mrg 	mov.i		ar.lc = r2
    297  1.1  mrg   (p12)	add		r8 = 1, r8
    298  1.1  mrg 	br.ret.sptk.many b0
    299  1.1  mrg ')
    300  1.1  mrg C Feed-in for n mod 4 = 2.  The variant above is guarded by an ifdef on an empty macro name (presumably never defined), so this one is assembled.
    301  1.1  mrg .Lb10:		C 03
    302  1.1  mrg 	br.cloop.dptk	.grt2			C taken if ar.lc != 0 (n > 2)
    303  1.1  mrg 		C 04
    304  1.1  mrg 		C 05
    305  1.1  mrg 		C 06
    306  1.1  mrg 	xma.l		fp0b_1 = ux, v0, f0
    307  1.1  mrg 	xma.hu		fp1a_1 = ux, v0, f0
    308  1.1  mrg 	;;	C 07
    309  1.1  mrg 	xma.l		fp0b_2 = uy, v0, f0
    310  1.1  mrg 	xma.hu		fp1a_2 = uy, v0, f0
    311  1.1  mrg 	;;	C 08
    312  1.1  mrg 		C 09
    313  1.1  mrg 		C 10
    314  1.1  mrg 	stf8		[rp] = fp0b_1, 8	C limb 0 = low limb of ux*v0
    315  1.1  mrg 	xma.l		fp1b_1 = ux, v1, fp1a_1
    316  1.1  mrg 	xma.hu		fp2a_1 = ux, v1, fp1a_1
    317  1.1  mrg 	;;	C 11
    318  1.1  mrg 	getf.sig	acc0 = fp0b_2
    319  1.1  mrg 	xma.l		fp1b_2 = uy, v1, fp1a_2
    320  1.1  mrg 	xma.hu		fp2a_2 = uy, v1, fp1a_2
    321  1.1  mrg 	;;	C 12
    322  1.1  mrg 		C 13
    323  1.1  mrg 		C 14
    324  1.1  mrg 	getf.sig	pr1_1 = fp1b_1
    325  1.1  mrg 		C 15
    326  1.1  mrg 	getf.sig	acc1_1 = fp2a_1
    327  1.1  mrg 		C 16
    328  1.1  mrg 	getf.sig	pr1_2 = fp1b_2
    329  1.1  mrg 		C 17
    330  1.1  mrg 	getf.sig	r8 = fp2a_2
    331  1.1  mrg 	;;	C 18
    332  1.1  mrg 		C 19
    333  1.1  mrg 	add		s0 = pr1_1, acc0
    334  1.1  mrg 	;;	C 20
    335  1.1  mrg 	st8		[rp] = s0, 8
    336  1.1  mrg 	cmp.ltu		p8, p9 = s0, pr1_1
    337  1.1  mrg 	sub		r31 = -1, acc1_1	C r31 = ~acc1_1, used by the carry test below
    338  1.1  mrg 	;;	C 21
    339  1.1  mrg 	.pred.rel "mutex", p8, p9
    340  1.1  mrg   (p8)	add		acc0 = pr1_2, acc1_1, 1
    341  1.1  mrg   (p9)	add		acc0 = pr1_2, acc1_1
    342  1.1  mrg   (p8)	cmp.leu		p10, p0 = r31, pr1_2	C carry out of pr1_2 + acc1_1 + 1
    343  1.1  mrg   (p9)	cmp.ltu		p10, p0 = r31, pr1_2	C carry out of pr1_2 + acc1_1
    344  1.1  mrg 	;;	C 22
    345  1.1  mrg 	st8		[rp] = acc0, 8
    346  1.1  mrg 	mov.i		ar.lc = r2
    347  1.1  mrg   (p10)	add		r8 = 1, r8		C r8 = top limb plus that carry
    348  1.1  mrg 	br.ret.sptk.many b0
    349  1.1  mrg 
    350  1.1  mrg C n mod 4 = 2 with n > 2.  ux uses slot 1, uy slot 2; acc1_0, pr1_0, pr0_1 and the carry-ins start at zero.  Joins the loop at .LL10.
    351  1.1  mrg .grt2:	ldf8		u_3 = [up], 8
    352  1.1  mrg 	mov		acc1_0 = 0
    353  1.1  mrg 	mov		pr1_0 = 0
    354  1.1  mrg 	;;
    355  1.1  mrg 	mov		pr0_1 = 0
    356  1.1  mrg 	xma.l		fp0b_1 = ux, v0, f0
    357  1.1  mrg 	ldf8		u_0 = [up], 8
    358  1.1  mrg 	xma.hu		fp1a_1 = ux, v0, f0
    359  1.1  mrg 	;;
    360  1.1  mrg 	xma.l		fp0b_2 = uy, v0, f0
    361  1.1  mrg 	xma.hu		fp1a_2 = uy, v0, f0
    362  1.1  mrg 	;;
    363  1.1  mrg 	getf.sig	acc0 = fp0b_1
    364  1.1  mrg 	xma.l		fp1b_1 = ux, v1, fp1a_1
    365  1.1  mrg 	xma.hu		fp2a_1 = ux, v1, fp1a_1
    366  1.1  mrg 	;;
    367  1.1  mrg 	ldf8		u_1 = [up], 8
    368  1.1  mrg 	xma.l		fp0b_3 = u_3, v0, f0
    369  1.1  mrg 	xma.hu		fp1a_3 = u_3, v0, f0
    370  1.1  mrg 	;;
    371  1.1  mrg 	getf.sig	pr0_2 = fp0b_2
    372  1.1  mrg 	xma.l		fp1b_2 = uy, v1, fp1a_2
    373  1.1  mrg 	xma.hu		fp2a_2 = uy, v1, fp1a_2
    374  1.1  mrg 	;;
    375  1.1  mrg 	ldf8		u_2 = [up], 8
    376  1.1  mrg 	getf.sig	pr1_1 = fp1b_1
    377  1.1  mrg 	;;
    378  1.1  mrg 	getf.sig	acc1_1 = fp2a_1
    379  1.1  mrg 	xma.l		fp0b_0 = u_0, v0, f0
    380  1.1  mrg 	cmp.ne		p8, p9 = r0, r0		C p8 = 0, p9 = 1: no carry-in
    381  1.1  mrg 	cmp.ne		p12, p13 = r0, r0	C p12 = 0, p13 = 1: no carry-in
    382  1.1  mrg 	xma.hu		fp1a_0 = u_0, v0, f0
    383  1.1  mrg 	br		.LL10
    384  1.1  mrg 
    385  1.1  mrg C Feed-in for n mod 4 = 3.  ux uses slot 0, uy slot 1; acc1_3, pr1_3, pr0_0 and the carry-ins start at zero.
    386  1.1  mrg 	ALIGN(32)
    387  1.1  mrg .Lb11:	mov		acc1_3 = 0
    388  1.1  mrg 	mov		pr1_3 = 0
    389  1.1  mrg 	mov		pr0_0 = 0
    390  1.1  mrg 	cmp.ne		p6, p7 = r0, r0		C p6 = 0, p7 = 1: no carry-in
    391  1.1  mrg 	;;
    392  1.1  mrg 	ldf8		u_2 = [up], 8
    393  1.1  mrg 	br.cloop.dptk	.grt3			C taken if ar.lc != 0 (n > 3); otherwise n = 3, finish via .Lcj3
    394  1.1  mrg 	;;
    395  1.1  mrg 	xma.l		fp0b_0 = ux, v0, f0
    396  1.1  mrg 	xma.hu		fp1a_0 = ux, v0, f0
    397  1.1  mrg 	;;
    398  1.1  mrg 	cmp.ne		p10, p11 = r0, r0	C p10 = 0, p11 = 1: no carry-in
    399  1.1  mrg 	xma.l		fp0b_1 = uy, v0, f0
    400  1.1  mrg 	xma.hu		fp1a_1 = uy, v0, f0
    401  1.1  mrg 	;;
    402  1.1  mrg 	getf.sig	acc0 = fp0b_0
    403  1.1  mrg 	xma.l		fp1b_0 = ux, v1, fp1a_0
    404  1.1  mrg 	xma.hu		fp2a_0 = ux, v1, fp1a_0
    405  1.1  mrg 	;;
    406  1.1  mrg 	xma.l		fp0b_2 = u_2, v0, f0
    407  1.1  mrg 	xma.hu		fp1a_2 = u_2, v0, f0
    408  1.1  mrg 	;;
    409  1.1  mrg 	getf.sig	pr0_1 = fp0b_1
    410  1.1  mrg 	xma.l		fp1b_1 = uy, v1, fp1a_1
    411  1.1  mrg 	xma.hu		fp2a_1 = uy, v1, fp1a_1
    412  1.1  mrg 	;;
    413  1.1  mrg 	getf.sig	pr1_0 = fp1b_0
    414  1.1  mrg 	getf.sig	acc1_0 = fp2a_0
    415  1.1  mrg 	br		.Lcj3
    416  1.1  mrg C n > 3: same products as above, interleaved with further u loads; join the main loop at .LL11.
    417  1.1  mrg .grt3:	xma.l		fp0b_0 = ux, v0, f0
    418  1.1  mrg 	cmp.ne		p10, p11 = r0, r0
    419  1.1  mrg 	ldf8		u_3 = [up], 8
    420  1.1  mrg 	xma.hu		fp1a_0 = ux, v0, f0
    421  1.1  mrg 	;;
    422  1.1  mrg 	xma.l		fp0b_1 = uy, v0, f0
    423  1.1  mrg 	xma.hu		fp1a_1 = uy, v0, f0
    424  1.1  mrg 	;;
    425  1.1  mrg 	getf.sig	acc0 = fp0b_0
    426  1.1  mrg 	xma.l		fp1b_0 = ux, v1, fp1a_0
    427  1.1  mrg 	ldf8		u_0 = [up], 8
    428  1.1  mrg 	xma.hu		fp2a_0 = ux, v1, fp1a_0
    429  1.1  mrg 	;;
    430  1.1  mrg 	xma.l		fp0b_2 = u_2, v0, f0
    431  1.1  mrg 	xma.hu		fp1a_2 = u_2, v0, f0
    432  1.1  mrg 	;;
    433  1.1  mrg 	getf.sig	pr0_1 = fp0b_1
    434  1.1  mrg 	xma.l		fp1b_1 = uy, v1, fp1a_1
    435  1.1  mrg 	xma.hu		fp2a_1 = uy, v1, fp1a_1
    436  1.1  mrg 	;;
    437  1.1  mrg 	ldf8		u_1 = [up], 8
    438  1.1  mrg 	getf.sig	pr1_0 = fp1b_0
    439  1.1  mrg 	;;
    440  1.1  mrg 	getf.sig	acc1_0 = fp2a_0
    441  1.1  mrg 	xma.l		fp0b_3 = u_3, v0, f0
    442  1.1  mrg 	xma.hu		fp1a_3 = u_3, v0, f0
    443  1.1  mrg 	br		.LL11
    444  1.1  mrg C Each pass of the loop loads four u limbs and stores four result limbs, rotating through slots 0..3.
    445  1.1  mrg C Result limb k = pr0_k + pr1_(k-1) + acc1_(k-2) + carries: acc0 = pr0 + acc1 first, then s0 = pr1 + acc0.
    446  1.1  mrg C *** MAIN LOOP START ***
    447  1.1  mrg 	ALIGN(32)
    448  1.1  mrg .Loop:						C 00
    449  1.1  mrg 	.pred.rel "mutex", p12, p13
    450  1.1  mrg 	getf.sig	pr0_3 = fp0b_3
    451  1.1  mrg 	xma.l		fp1b_3 = u_3, v1, fp1a_3
    452  1.1  mrg   (p12)	add		s0 = pr1_0, acc0, 1	C p12/p13 (alternating with p10/p11) carry the s0 chain
    453  1.1  mrg   (p13)	add		s0 = pr1_0, acc0
    454  1.1  mrg 	xma.hu		fp2a_3 = u_3, v1, fp1a_3
    455  1.1  mrg 	;;					C 01
    456  1.1  mrg 	.pred.rel "mutex", p8, p9
    457  1.1  mrg 	.pred.rel "mutex", p12, p13
    458  1.1  mrg 	ldf8		u_3 = [up], 8
    459  1.1  mrg 	getf.sig	pr1_2 = fp1b_2
    460  1.1  mrg   (p8)	cmp.leu		p6, p7 = acc0, pr0_1	C carry out of the acc0 add: with a carry-in,
    461  1.1  mrg   (p9)	cmp.ltu		p6, p7 = acc0, pr0_1	C sum <= addend; without one, sum < addend
    462  1.1  mrg   (p12)	cmp.leu		p10, p11 = s0, pr1_0	C same test for the s0 add
    463  1.1  mrg   (p13)	cmp.ltu		p10, p11 = s0, pr1_0
    464  1.1  mrg 	;;					C 02
    465  1.1  mrg 	.pred.rel "mutex", p6, p7
    466  1.1  mrg 	getf.sig	acc1_2 = fp2a_2
    467  1.1  mrg 	st8		[rp] = s0, 8
    468  1.1  mrg 	xma.l		fp0b_1 = u_1, v0, f0
    469  1.1  mrg   (p6)	add		acc0 = pr0_2, acc1_0, 1	C p6/p7 (alternating with p8/p9) carry the acc0 chain
    470  1.1  mrg   (p7)	add		acc0 = pr0_2, acc1_0
    471  1.1  mrg 	xma.hu		fp1a_1 = u_1, v0, f0
    472  1.1  mrg 	;;					C 03
    473  1.1  mrg .LL01:						C entry from .grt5 (n mod 4 = 1)
    474  1.1  mrg 	.pred.rel "mutex", p10, p11
    475  1.1  mrg 	getf.sig	pr0_0 = fp0b_0
    476  1.1  mrg 	xma.l		fp1b_0 = u_0, v1, fp1a_0
    477  1.1  mrg   (p10)	add		s0 = pr1_1, acc0, 1
    478  1.1  mrg   (p11)	add		s0 = pr1_1, acc0
    479  1.1  mrg 	xma.hu		fp2a_0 = u_0, v1, fp1a_0
    480  1.1  mrg 	;;					C 04
    481  1.1  mrg 	.pred.rel "mutex", p6, p7
    482  1.1  mrg 	.pred.rel "mutex", p10, p11
    483  1.1  mrg 	ldf8		u_0 = [up], 8
    484  1.1  mrg 	getf.sig	pr1_3 = fp1b_3
    485  1.1  mrg   (p6)	cmp.leu		p8, p9 = acc0, pr0_2
    486  1.1  mrg   (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
    487  1.1  mrg   (p10)	cmp.leu		p12, p13 = s0, pr1_1
    488  1.1  mrg   (p11)	cmp.ltu		p12, p13 = s0, pr1_1
    489  1.1  mrg 	;;					C 05
    490  1.1  mrg 	.pred.rel "mutex", p8, p9
    491  1.1  mrg 	getf.sig	acc1_3 = fp2a_3
    492  1.1  mrg 	st8		[rp] = s0, 8
    493  1.1  mrg 	xma.l		fp0b_2 = u_2, v0, f0
    494  1.1  mrg   (p8)	add		acc0 = pr0_3, acc1_1, 1
    495  1.1  mrg   (p9)	add		acc0 = pr0_3, acc1_1
    496  1.1  mrg 	xma.hu		fp1a_2 = u_2, v0, f0
    497  1.1  mrg 	;;					C 06
    498  1.1  mrg .LL00:						C entry from .grt4 (n mod 4 = 0)
    499  1.1  mrg 	.pred.rel "mutex", p12, p13
    500  1.1  mrg 	getf.sig	pr0_1 = fp0b_1
    501  1.1  mrg 	xma.l		fp1b_1 = u_1, v1, fp1a_1
    502  1.1  mrg   (p12)	add		s0 = pr1_2, acc0, 1
    503  1.1  mrg   (p13)	add		s0 = pr1_2, acc0
    504  1.1  mrg 	xma.hu		fp2a_1 = u_1, v1, fp1a_1
    505  1.1  mrg 	;;					C 07
    506  1.1  mrg 	.pred.rel "mutex", p8, p9
    507  1.1  mrg 	.pred.rel "mutex", p12, p13
    508  1.1  mrg 	ldf8		u_1 = [up], 8
    509  1.1  mrg 	getf.sig	pr1_0 = fp1b_0
    510  1.1  mrg   (p8)	cmp.leu		p6, p7 = acc0, pr0_3
    511  1.1  mrg   (p9)	cmp.ltu		p6, p7 = acc0, pr0_3
    512  1.1  mrg   (p12)	cmp.leu		p10, p11 = s0, pr1_2
    513  1.1  mrg   (p13)	cmp.ltu		p10, p11 = s0, pr1_2
    514  1.1  mrg 	;;					C 08
    515  1.1  mrg 	.pred.rel "mutex", p6, p7
    516  1.1  mrg 	getf.sig	acc1_0 = fp2a_0
    517  1.1  mrg 	st8		[rp] = s0, 8
    518  1.1  mrg 	xma.l		fp0b_3 = u_3, v0, f0
    519  1.1  mrg   (p6)	add		acc0 = pr0_0, acc1_2, 1
    520  1.1  mrg   (p7)	add		acc0 = pr0_0, acc1_2
    521  1.1  mrg 	xma.hu		fp1a_3 = u_3, v0, f0
    522  1.1  mrg 	;;					C 09
    523  1.1  mrg .LL11:						C entry from .grt3 (n mod 4 = 3)
    524  1.1  mrg 	.pred.rel "mutex", p10, p11
    525  1.1  mrg 	getf.sig	pr0_2 = fp0b_2
    526  1.1  mrg 	xma.l		fp1b_2 = u_2, v1, fp1a_2
    527  1.1  mrg   (p10)	add		s0 = pr1_3, acc0, 1
    528  1.1  mrg   (p11)	add		s0 = pr1_3, acc0
    529  1.1  mrg 	xma.hu		fp2a_2 = u_2, v1, fp1a_2
    530  1.1  mrg 	;;					C 10
    531  1.1  mrg 	.pred.rel "mutex", p6, p7
    532  1.1  mrg 	.pred.rel "mutex", p10, p11
    533  1.1  mrg 	ldf8		u_2 = [up], 8
    534  1.1  mrg 	getf.sig	pr1_1 = fp1b_1
    535  1.1  mrg   (p6)	cmp.leu		p8, p9 = acc0, pr0_0
    536  1.1  mrg   (p7)	cmp.ltu		p8, p9 = acc0, pr0_0
    537  1.1  mrg   (p10)	cmp.leu		p12, p13 = s0, pr1_3
    538  1.1  mrg   (p11)	cmp.ltu		p12, p13 = s0, pr1_3
    539  1.1  mrg 	;;					C 11
    540  1.1  mrg 	.pred.rel "mutex", p8, p9
    541  1.1  mrg 	getf.sig	acc1_1 = fp2a_1
    542  1.1  mrg 	st8		[rp] = s0, 8
    543  1.1  mrg 	xma.l		fp0b_0 = u_0, v0, f0
    544  1.1  mrg   (p8)	add		acc0 = pr0_1, acc1_3, 1
    545  1.1  mrg   (p9)	add		acc0 = pr0_1, acc1_3
    546  1.1  mrg 	xma.hu		fp1a_0 = u_0, v0, f0
    547  1.1  mrg .LL10:	br.cloop.dptk	.Loop			C 12, also the entry from .grt2 (n mod 4 = 2)
    548  1.1  mrg 	;;
    549  1.1  mrg C *** MAIN LOOP END ***
    550  1.1  mrg C Wind-down: same add/carry schedule as the loop body, with no further loads from up.
    551  1.1  mrg .Lcj6:						C fall-through from the main loop
    552  1.1  mrg 	.pred.rel "mutex", p12, p13
    553  1.1  mrg 	getf.sig	pr0_3 = fp0b_3
    554  1.1  mrg 	xma.l		fp1b_3 = u_3, v1, fp1a_3
    555  1.1  mrg   (p12)	add		s0 = pr1_0, acc0, 1
    556  1.1  mrg   (p13)	add		s0 = pr1_0, acc0
    557  1.1  mrg 	xma.hu		fp2a_3 = u_3, v1, fp1a_3
    558  1.1  mrg 	;;
    559  1.1  mrg 	.pred.rel "mutex", p8, p9
    560  1.1  mrg 	.pred.rel "mutex", p12, p13
    561  1.1  mrg 	getf.sig	pr1_2 = fp1b_2
    562  1.1  mrg   (p8)	cmp.leu		p6, p7 = acc0, pr0_1
    563  1.1  mrg   (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
    564  1.1  mrg   (p12)	cmp.leu		p10, p11 = s0, pr1_0
    565  1.1  mrg   (p13)	cmp.ltu		p10, p11 = s0, pr1_0
    566  1.1  mrg 	;;
    567  1.1  mrg 	.pred.rel "mutex", p6, p7
    568  1.1  mrg 	getf.sig	acc1_2 = fp2a_2
    569  1.1  mrg 	st8		[rp] = s0, 8
    570  1.1  mrg 	xma.l		fp0b_1 = u_1, v0, f0
    571  1.1  mrg   (p6)	add		acc0 = pr0_2, acc1_0, 1
    572  1.1  mrg   (p7)	add		acc0 = pr0_2, acc1_0
    573  1.1  mrg 	xma.hu		fp1a_1 = u_1, v0, f0
    574  1.1  mrg 	;;
    575  1.1  mrg .Lcj5:						C entry from .Lb01 when n = 5
    576  1.1  mrg 	.pred.rel "mutex", p10, p11
    577  1.1  mrg 	getf.sig	pr0_0 = fp0b_0
    578  1.1  mrg 	xma.l		fp1b_0 = u_0, v1, fp1a_0
    579  1.1  mrg   (p10)	add		s0 = pr1_1, acc0, 1
    580  1.1  mrg   (p11)	add		s0 = pr1_1, acc0
    581  1.1  mrg 	xma.hu		fp2a_0 = u_0, v1, fp1a_0
    582  1.1  mrg 	;;
    583  1.1  mrg 	.pred.rel "mutex", p6, p7
    584  1.1  mrg 	.pred.rel "mutex", p10, p11
    585  1.1  mrg 	getf.sig	pr1_3 = fp1b_3
    586  1.1  mrg   (p6)	cmp.leu		p8, p9 = acc0, pr0_2
    587  1.1  mrg   (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
    588  1.1  mrg   (p10)	cmp.leu		p12, p13 = s0, pr1_1
    589  1.1  mrg   (p11)	cmp.ltu		p12, p13 = s0, pr1_1
    590  1.1  mrg 	;;
    591  1.1  mrg 	.pred.rel "mutex", p8, p9
    592  1.1  mrg 	getf.sig	acc1_3 = fp2a_3
    593  1.1  mrg 	st8		[rp] = s0, 8
    594  1.1  mrg 	xma.l		fp0b_2 = u_2, v0, f0
    595  1.1  mrg   (p8)	add		acc0 = pr0_3, acc1_1, 1
    596  1.1  mrg   (p9)	add		acc0 = pr0_3, acc1_1
    597  1.1  mrg 	xma.hu		fp1a_2 = u_2, v0, f0
    598  1.1  mrg 	;;
    599  1.1  mrg .Lcj4:						C entry from .Lb00 when n = 4
    600  1.1  mrg 	.pred.rel "mutex", p12, p13
    601  1.1  mrg 	getf.sig	pr0_1 = fp0b_1
    602  1.1  mrg 	xma.l		fp1b_1 = u_1, v1, fp1a_1
    603  1.1  mrg   (p12)	add		s0 = pr1_2, acc0, 1
    604  1.1  mrg   (p13)	add		s0 = pr1_2, acc0
    605  1.1  mrg 	xma.hu		fp2a_1 = u_1, v1, fp1a_1
    606  1.1  mrg 	;;
    607  1.1  mrg 	.pred.rel "mutex", p8, p9
    608  1.1  mrg 	.pred.rel "mutex", p12, p13
    609  1.1  mrg 	getf.sig	pr1_0 = fp1b_0
    610  1.1  mrg   (p8)	cmp.leu		p6, p7 = acc0, pr0_3
    611  1.1  mrg   (p9)	cmp.ltu		p6, p7 = acc0, pr0_3
    612  1.1  mrg   (p12)	cmp.leu		p10, p11 = s0, pr1_2
    613  1.1  mrg   (p13)	cmp.ltu		p10, p11 = s0, pr1_2
    614  1.1  mrg 	;;
    615  1.1  mrg 	.pred.rel "mutex", p6, p7
    616  1.1  mrg 	getf.sig	acc1_0 = fp2a_0
    617  1.1  mrg 	st8		[rp] = s0, 8
    618  1.1  mrg   (p6)	add		acc0 = pr0_0, acc1_2, 1
    619  1.1  mrg   (p7)	add		acc0 = pr0_0, acc1_2
    620  1.1  mrg 	;;
    621  1.1  mrg .Lcj3:						C entry from .Lb11 when n = 3
    622  1.1  mrg 	.pred.rel "mutex", p10, p11
    623  1.1  mrg 	getf.sig	pr0_2 = fp0b_2
    624  1.1  mrg 	xma.l		fp1b_2 = u_2, v1, fp1a_2
    625  1.1  mrg   (p10)	add		s0 = pr1_3, acc0, 1
    626  1.1  mrg   (p11)	add		s0 = pr1_3, acc0
    627  1.1  mrg 	xma.hu		fp2a_2 = u_2, v1, fp1a_2
    628  1.1  mrg 	;;
    629  1.1  mrg 	.pred.rel "mutex", p6, p7
    630  1.1  mrg 	.pred.rel "mutex", p10, p11
    631  1.1  mrg 	getf.sig	pr1_1 = fp1b_1
    632  1.1  mrg   (p6)	cmp.leu		p8, p9 = acc0, pr0_0
    633  1.1  mrg   (p7)	cmp.ltu		p8, p9 = acc0, pr0_0
    634  1.1  mrg   (p10)	cmp.leu		p12, p13 = s0, pr1_3
    635  1.1  mrg   (p11)	cmp.ltu		p12, p13 = s0, pr1_3
    636  1.1  mrg 	;;
    637  1.1  mrg 	.pred.rel "mutex", p8, p9
    638  1.1  mrg 	getf.sig	acc1_1 = fp2a_1
    639  1.1  mrg 	st8		[rp] = s0, 8
    640  1.1  mrg   (p8)	add		acc0 = pr0_1, acc1_3, 1
    641  1.1  mrg   (p9)	add		acc0 = pr0_1, acc1_3
    642  1.1  mrg 	;;
    643  1.1  mrg 	.pred.rel "mutex", p12, p13
    644  1.1  mrg   (p12)	add		s0 = pr1_0, acc0, 1
    645  1.1  mrg   (p13)	add		s0 = pr1_0, acc0
    646  1.1  mrg 	;;
    647  1.1  mrg 	.pred.rel "mutex", p8, p9
    648  1.1  mrg 	.pred.rel "mutex", p12, p13
    649  1.1  mrg 	getf.sig	pr1_2 = fp1b_2
    650  1.1  mrg   (p8)	cmp.leu		p6, p7 = acc0, pr0_1
    651  1.1  mrg   (p9)	cmp.ltu		p6, p7 = acc0, pr0_1
    652  1.1  mrg   (p12)	cmp.leu		p10, p11 = s0, pr1_0
    653  1.1  mrg   (p13)	cmp.ltu		p10, p11 = s0, pr1_0
    654  1.1  mrg 	;;
    655  1.1  mrg 	.pred.rel "mutex", p6, p7
    656  1.1  mrg 	getf.sig	acc1_2 = fp2a_2
    657  1.1  mrg 	st8		[rp] = s0, 8
    658  1.1  mrg   (p6)	add		acc0 = pr0_2, acc1_0, 1
    659  1.1  mrg   (p7)	add		acc0 = pr0_2, acc1_0
    660  1.1  mrg 	;;
    661  1.1  mrg 	.pred.rel "mutex", p10, p11
    662  1.1  mrg   (p10)	add		s0 = pr1_1, acc0, 1
    663  1.1  mrg   (p11)	add		s0 = pr1_1, acc0
    664  1.1  mrg 	;;
    665  1.1  mrg 	.pred.rel "mutex", p6, p7
    666  1.1  mrg 	.pred.rel "mutex", p10, p11
    667  1.1  mrg   (p6)	cmp.leu		p8, p9 = acc0, pr0_2
    668  1.1  mrg   (p7)	cmp.ltu		p8, p9 = acc0, pr0_2
    669  1.1  mrg   (p10)	cmp.leu		p12, p13 = s0, pr1_1
    670  1.1  mrg   (p11)	cmp.ltu		p12, p13 = s0, pr1_1
    671  1.1  mrg 	;;
    672  1.1  mrg 	.pred.rel "mutex", p8, p9
    673  1.1  mrg 	st8		[rp] = s0, 8
    674  1.1  mrg   (p8)	add		acc0 = pr1_2, acc1_1, 1	C last stored limb: pr1_2 + acc1_1 + acc0-chain carry
    675  1.1  mrg   (p9)	add		acc0 = pr1_2, acc1_1
    676  1.1  mrg 	;;
    677  1.1  mrg 	.pred.rel "mutex", p8, p9
    678  1.1  mrg   (p8)	cmp.leu		p10, p11 = acc0, pr1_2
    679  1.1  mrg   (p9)	cmp.ltu		p10, p11 = acc0, pr1_2
    680  1.1  mrg   (p12)	add		acc0 = 1, acc0		C fold in the pending s0-chain carry
    681  1.1  mrg 	;;
    682  1.1  mrg 	st8		[rp] = acc0, 8
    683  1.1  mrg   (p12)	cmp.eq.or	p10, p0 = 0, acc0	C that increment wrapped to zero: also a carry out
    684  1.1  mrg 	mov		r8 = acc1_2		C r8 = most significant limb
    685  1.1  mrg 	;;
    686  1.1  mrg 	.pred.rel "mutex", p10, p11
    687  1.1  mrg   (p10)	add		r8 = 1, r8		C plus the final carry
    688  1.1  mrg 	mov.i		ar.lc = r2		C restore the ar.lc value saved at entry
    689  1.1  mrg 	br.ret.sptk.many b0
    690  1.1  mrg EPILOGUE()
    691  1.1  mrg ASM_END()
    692