Home | History | Annotate | Line # | Download | only in ia64
mul_2.asm revision 1.1.1.1.2.1
      1 dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number by a 2-limb number and
      2 dnl  store the result to a (n+1)-limb number.
      3 
      4 dnl  Contributed to the GNU project by Torbjorn Granlund.
      5 
      6 dnl  Copyright 2004, 2011 Free Software Foundation, Inc.
      7 
      8 dnl  This file is part of the GNU MP Library.
      9 
     10 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     11 dnl  it under the terms of the GNU Lesser General Public License as published
     12 dnl  by the Free Software Foundation; either version 3 of the License, or (at
     13 dnl  your option) any later version.
     14 
     15 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     16 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     17 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
     18 dnl  License for more details.
     19 
     20 dnl  You should have received a copy of the GNU Lesser General Public License
     21 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
     22 
     23 include(`../config.m4')
     24 
     25 C         cycles/limb
     26 C Itanium:    ?
     27 C Itanium 2:  1.5
     28 
     29 C TODO
     30 C  * Clean up variable names, and try to decrease the number of distinct
     31 C    registers used.
     32 C  * Clean up feed-in code to not require zeroing several registers.
     33 C  * Make sure we don't depend on uninitialized predicate registers.
     34 C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
     35 C    wind-down code.
     36 C  * Ultimately rewrite.  The problem with this code is that it first uses a
     37 C    loaded u value in one xma pair, then leaves it live over several unrelated
     38 C    xma pairs, before it uses it again.  It should actually be quite possible
     39 C    to just swap some aligned xma pairs around.  But we should then schedule
     40 C    u loads further from the first use.
     41 
     42 C INPUT PARAMETERS
        C As used by the code below: rp is the address the result limbs are stored
        C to, up is the address the u limbs are loaded from, n is the limb count of
        C the u operand, and vp is the address the two v limbs are loaded from.
     43 define(`rp',`r32')
     44 define(`up',`r33')
     45 define(`n',`r34')
     46 define(`vp',`r35')
     47 
        C srp is defined here but not referenced by any instruction in this file.
     48 define(`srp',`r3')
     49 
        C v0 and v1 hold the two limbs loaded from vp with ldf8.
     50 define(`v0',`f6')
     51 define(`v1',`f7')
     52 
        C s0 is the finished result limb handed to st8.  acc0 is the intermediate
        C sum of one pr0 value and one acc1 value plus carry.  r14 is also used
        C under its raw name as the n mod 4 scratch at function entry.
     53 define(`s0',`r14')
     54 define(`acc0',`r15')
     55 
        C pr0_i receive the fp0b_i values via getfsig.
     56 define(`pr0_0',`r16') define(`pr0_1',`r17')
     57 define(`pr0_2',`r18') define(`pr0_3',`r19')
     58 
        C pr1_i receive the fp1b_i values via getfsig.
     59 define(`pr1_0',`r20') define(`pr1_1',`r21')
     60 define(`pr1_2',`r22') define(`pr1_3',`r23')
     61 
        C acc1_i receive the fp2a_i values via getfsig.
     62 define(`acc1_0',`r24') define(`acc1_1',`r25')
     63 define(`acc1_2',`r26') define(`acc1_3',`r27')
     64 
        C Placeholders for r28-r31, currently commented out.  r31 is used under
        C its raw name as a scratch register in the b10 code.
     65 dnl define(`',`r28')
     66 dnl define(`',`r29')
     67 dnl define(`',`r30')
     68 dnl define(`',`r31')
     69 
        C fp0b_i are the xma.l results of u * v0 + f0, i.e. the low product half.
     70 define(`fp0b_0',`f8') define(`fp0b_1',`f9')
     71 define(`fp0b_2',`f10') define(`fp0b_3',`f11')
     72 
        C fp1a_i are the xma.hu results of u * v0 + f0, i.e. the high product half.
     73 define(`fp1a_0',`f12') define(`fp1a_1',`f13')
     74 define(`fp1a_2',`f14') define(`fp1a_3',`f15')
     75 
        C fp1b_i are the xma.l results of u * v1 + fp1a_i.
     76 define(`fp1b_0',`f32') define(`fp1b_1',`f33')
     77 define(`fp1b_2',`f34') define(`fp1b_3',`f35')
     78 
        C fp2a_i are the xma.hu results of u * v1 + fp1a_i.
     79 define(`fp2a_0',`f36') define(`fp2a_1',`f37')
     80 define(`fp2a_2',`f38') define(`fp2a_3',`f39')
     81 
        C u_0..u_3 are the four slots the u limbs are loaded into in turn.
     82 define(`u_0',`f44') define(`u_1',`f45')
     83 define(`u_2',`f46') define(`u_3',`f47')
     84 
        C ux and uy hold the first and second u limb, loaded at function entry.
     85 define(`ux',`f49')
     86 define(`uy',`f51')
     87 
     88 ASM_START()
        C mpn_mul_2: multiply the n-limb operand at up by the two limbs at vp.
        C Result limbs are written through rp with st8/stf8 and the last value is
        C left in r8 before br.ret.  This first part saves ar.lc, loads the first
        C two u limbs and both v limbs, and dispatches on n mod 4 to one of four
        C feed-in variants (b00 by fall-through, b01, b10, b11).
        C NOTE(review): two limbs are loaded from up unconditionally and the loop
        C count is (n-2)>>2 computed with an unsigned shift, so n >= 2 looks like
        C a precondition - confirm against the callers.
     89 PROLOGUE(mpn_mul_2)
     90 	.prologue
        C Unwind annotation: ar.lc is kept in r2.  The copy itself is the
        C mov r2 = ar.lc instruction further down.
     91 	.save	ar.lc, r2
     92 	.body
     93 
        C With the 32-bit ABI the three pointer arguments are turned into 64-bit
        C addresses with addp4 and n is zero-extended from 32 bits.
     94 ifdef(`HAVE_ABI_32',`
     95 .mmi;		addp4	rp = 0, rp		C			M I
     96 		addp4	up = 0, up		C			M I
     97 		addp4	vp = 0, vp		C			M I
     98 .mmi;		nop	1
     99 		nop	1
    100 		zxt4	n = n			C			I
    101 	;;')
    102 
        C Load the first u limb and v0, post-incrementing both pointers by 8, and
        C save ar.lc in r2.  r14 = n mod 4 selects the feed-in variant; n = n - 2.
    103 .mmi;		ldf8	ux = [up], 8		C			M
    104 		ldf8	v0 = [vp], 8		C			M
    105 		mov	r2 = ar.lc		C			I0
    106 .mmi;		nop	1			C			M
    107 		and	r14 = 3, n		C			M I
    108 		add	n = -2, n		C			M I
    109 	;;
        C Load the second u limb and v1.  n = (n-2) >> 2 becomes the loop count.
        C p10 is set when n mod 4 == 1, p11 when n mod 4 == 2.
    110 .mmi;		ldf8	uy = [up], 8		C			M
    111 		ldf8	v1 = [vp]		C			M
    112 		shr.u	n = n, 2		C			I
    113 .mmi;		nop	1			C			M
    114 		cmp.eq	p10, p0 = 1, r14	C			M I
    115 		cmp.eq	p11, p0 = 2, r14	C			M I
    116 	;;
        C p12 is set when n mod 4 == 3.  Install the loop count in ar.lc and
        C dispatch; when none of p10/p11/p12 is set execution falls into b00.
    117 .mmi;		nop	1			C			M
    118 		cmp.eq	p12, p0 = 3, r14	C			M I
    119 		mov	ar.lc = n		C			I0
    120 .bbb;	(p10)	br.dptk	L(b01)			C			B
    121 	(p11)	br.dptk	L(b10)			C			B
    122 	(p12)	br.dptk	L(b11)			C			B
    123 	;;
    124 
    125 	ALIGN(32)
        C Feed-in for n mod 4 == 0, reached by falling through the dispatch
        C branches.  Loads the third u limb into u_1, zeroes the accumulator
        C inputs that have no producer yet (acc1_2, pr1_2, pr0_3) and clears the
        C carry predicates: cmp.ne of r0 with itself is always false, so p8 = 0,
        C p9 = 1 and likewise p12 = 0, p13 = 1.
    126 L(b00):		ldf8	u_1 = [up], 8
    127 		mov	acc1_2 = 0
    128 		mov	pr1_2 = 0
    129 		mov	pr0_3 = 0
    130 		cmp.ne	p8, p9 = r0, r0
    131 	;;
        C Low and high halves of ux * v0, plus the load of the next u limb.
        C br.cloop goes to gt4 (decrementing ar.lc) when ar.lc is non-zero and
        C falls through to the short path when ar.lc is zero.
    132 		xma.l	fp0b_3 = ux, v0, f0
    133 		cmp.ne	p12, p13 = r0, r0
    134 		ldf8	u_2 = [up], 8
    135 		xma.hu	fp1a_3 = ux, v0, f0
    136 		br.cloop.dptk	L(gt4)
    137 
        C Short path: no further loads.  Issue the remaining feed-in products and
        C join the wind-down code at cj4.
    138 		xma.l	fp0b_0 = uy, v0, f0
    139 		xma.hu	fp1a_0 = uy, v0, f0
    140 	;;
    141 		getfsig	acc0 = fp0b_3
    142 		xma.l	fp1b_3 = ux, v1, fp1a_3
    143 		xma.hu	fp2a_3 = ux, v1, fp1a_3
    144 	;;
    145 		xma.l	fp0b_1 = u_1, v0, f0
    146 		xma.hu	fp1a_1 = u_1, v0, f0
    147 	;;
    148 		getfsig	pr0_0 = fp0b_0
    149 		xma.l	fp1b_0 = uy, v1, fp1a_0
    150 		xma.hu	fp2a_0 = uy, v1, fp1a_0
    151 	;;
    152 		getfsig	pr1_3 = fp1b_3
    153 		getfsig	acc1_3 = fp2a_3
    154 		xma.l	fp0b_2 = u_2, v0, f0
    155 		xma.hu	fp1a_2 = u_2, v0, f0
    156 		br	L(cj4)
    157 
        C Long path: the same products as the short path, interleaved with the
        C loads of u_3 and u_0 that the loop consumes.  Enters the loop at L(00).
    158 L(gt4):		xma.l	fp0b_0 = uy, v0, f0
    159 		xma.hu	fp1a_0 = uy, v0, f0
    160 	;;
    161 		getfsig	acc0 = fp0b_3
    162 		xma.l	fp1b_3 = ux, v1, fp1a_3
    163 		ldf8	u_3 = [up], 8
    164 		xma.hu	fp2a_3 = ux, v1, fp1a_3
    165 	;;
    166 		xma.l	fp0b_1 = u_1, v0, f0
    167 		xma.hu	fp1a_1 = u_1, v0, f0
    168 	;;
    169 		getfsig	pr0_0 = fp0b_0
    170 		xma.l	fp1b_0 = uy, v1, fp1a_0
    171 		xma.hu	fp2a_0 = uy, v1, fp1a_0
    172 	;;
    173 		ldf8	u_0 = [up], 8
    174 		getfsig	pr1_3 = fp1b_3
    175 		xma.l	fp0b_2 = u_2, v0, f0
    176 	;;
    177 		getfsig	acc1_3 = fp2a_3
    178 		xma.hu	fp1a_2 = u_2, v0, f0
    179 		br	L(00)
    180 
    181 
    182 	ALIGN(32)
        C Feed-in for n mod 4 == 1.  Loads the third u limb into u_0, zeroes the
        C accumulator inputs that have no producer yet (acc1_1, pr1_1, pr0_2) and
        C clears the carry predicates: p6 = 0, p7 = 1 and p10 = 0, p11 = 1.
    183 L(b01):		ldf8	u_0 = [up], 8		C M
    184 		mov	acc1_1 = 0		C M I
    185 		mov	pr1_1 = 0		C M I
    186 		mov	pr0_2 = 0		C M I
    187 		cmp.ne	p6, p7 = r0, r0		C M I
    188 	;;
    189 		xma.l	fp0b_2 = ux, v0, f0	C F
    190 		cmp.ne	p10, p11 = r0, r0	C M I
    191 		ldf8	u_1 = [up], 8		C M
    192 		xma.hu	fp1a_2 = ux, v0, f0	C F
    193 	;;
    194 		xma.l	fp0b_3 = uy, v0, f0	C F
    195 		xma.hu	fp1a_3 = uy, v0, f0	C F
    196 	;;
        C br.cloop goes to gt5 (decrementing ar.lc) when ar.lc is non-zero and
        C falls through to the short path when ar.lc is zero.
    197 		getfsig	acc0 = fp0b_2		C M
    198 		xma.l	fp1b_2 = ux, v1,fp1a_2	C F
    199 		ldf8	u_2 = [up], 8		C M
    200 		xma.hu	fp2a_2 = ux, v1,fp1a_2	C F
    201 		br.cloop.dptk	L(gt5)
    202 
        C Short path: no further loads.  Issue the remaining feed-in products and
        C join the wind-down code at cj5.
    203 		xma.l	fp0b_0 = u_0, v0, f0	C F
    204 		xma.hu	fp1a_0 = u_0, v0, f0	C F
    205 	;;
    206 		getfsig	pr0_3 = fp0b_3		C M
    207 		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
    208 		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
    209 	;;
    210 		getfsig	pr1_2 = fp1b_2		C M
    211 		getfsig	acc1_2 = fp2a_2		C M
    212 		xma.l	fp0b_1 = u_1, v0, f0	C F
    213 		xma.hu	fp1a_1 = u_1, v0, f0	C F
    214 		br	L(cj5)
    215 
        C Long path: the same products plus the load of u_3 for the loop, which
        C is entered at L(01).
    216 L(gt5):		xma.l	fp0b_0 = u_0, v0, f0
    217 		xma.hu	fp1a_0 = u_0, v0, f0
    218 	;;
    219 		getfsig	pr0_3 = fp0b_3
    220 		xma.l	fp1b_3 = uy, v1, fp1a_3
    221 		xma.hu	fp2a_3 = uy, v1, fp1a_3
    222 	;;
    223 		ldf8	u_3 = [up], 8
    224 		getfsig	pr1_2 = fp1b_2
    225 		xma.l	fp0b_1 = u_1, v0, f0
    226 	;;
    227 		getfsig	acc1_2 = fp2a_2
    228 		xma.hu	fp1a_1 = u_1, v0, f0
    229 		br	L(01)
    230 
    231 
    232 	ALIGN(32)
        C Feed-in for n mod 4 == 2.  br.cloop goes to gt2 (decrementing ar.lc)
        C when ar.lc is non-zero.  The fall-through handles a zero loop count,
        C which for this residue means n == 2 (assuming n >= 2), and computes the
        C whole product in straight-line code.
    233 L(b10):		br.cloop.dptk	L(gt2)
    234 		xma.l	fp0b_1 = ux, v0, f0
    235 		xma.hu	fp1a_1 = ux, v0, f0
    236 	;;
    237 		xma.l	fp0b_2 = uy, v0, f0
    238 		xma.hu	fp1a_2 = uy, v0, f0
    239 	;;
        C The lowest result limb is the low half of ux * v0; it is stored
        C directly from the floating-point register.
    240 		stf8	[rp] = fp0b_1, 8
    241 		xma.l	fp1b_1 = ux, v1, fp1a_1
    242 		xma.hu	fp2a_1 = ux, v1, fp1a_1
    243 	;;
    244 		getfsig	acc0 = fp0b_2
    245 		xma.l	fp1b_2 = uy, v1, fp1a_2
    246 		xma.hu	fp2a_2 = uy, v1, fp1a_2
    247 	;;
        C Move the partial products to general registers, restore ar.lc from r2,
        C and start r8 off as the high half of the last product.
    248 		getfsig	pr1_1 = fp1b_1
    249 		getfsig	acc1_1 = fp2a_1
    250 		mov	ar.lc = r2
    251 		getfsig	pr1_2 = fp1b_2
    252 		getfsig	r8 = fp2a_2
    253 	;;
    254 		add	s0 = pr1_1, acc0
    255 	;;
        C Store the second limb.  p8 is the carry out of the add above (the sum
        C is below one addend), p9 its complement.  r31 = -1 - acc1_1 is the
        C largest value that can be added to acc1_1 without wrapping.
    256 		st8	[rp] = s0, 8
    257 		cmp.ltu	p8, p9 = s0, pr1_1
    258 		sub	r31 = -1, acc1_1
    259 	;;
        C Third limb = pr1_2 + acc1_1 + carry.  p10 is the carry out of that sum:
        C with carry-in it wraps when r31 <= pr1_2, without when r31 < pr1_2.
    260 		.pred.rel "mutex", p8, p9
    261 	(p8)	add	acc0 = pr1_2, acc1_1, 1
    262 	(p9)	add	acc0 = pr1_2, acc1_1
    263 	(p8)	cmp.leu	p10, p0 = r31, pr1_2
    264 	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
    265 	;;
        C Store the third limb, add the final carry into r8 and return.
    266 		st8	[rp] = acc0, 8
    267 	(p10)	add	r8 = 1, r8
    268 		br.ret.sptk.many b0
    269 
        C Long path: load u_3, zero the accumulator inputs that have no producer
        C yet (acc1_0, pr1_0, pr0_1), issue the feed-in products while loading
        C u_0, u_1 and u_2, clear p8 and p12 (setting p9 and p13), and enter the
        C loop at L(10).
    270 L(gt2):		ldf8	u_3 = [up], 8
    271 		mov	acc1_0 = 0
    272 		mov	pr1_0 = 0
    273 	;;
    274 		mov	pr0_1 = 0
    275 		xma.l	fp0b_1 = ux, v0, f0
    276 		ldf8	u_0 = [up], 8
    277 		xma.hu	fp1a_1 = ux, v0, f0
    278 	;;
    279 		xma.l	fp0b_2 = uy, v0, f0
    280 		xma.hu	fp1a_2 = uy, v0, f0
    281 	;;
    282 		getfsig	acc0 = fp0b_1
    283 		xma.l	fp1b_1 = ux, v1, fp1a_1
    284 		xma.hu	fp2a_1 = ux, v1, fp1a_1
    285 	;;
    286 		ldf8	u_1 = [up], 8
    287 		xma.l	fp0b_3 = u_3, v0, f0
    288 		xma.hu	fp1a_3 = u_3, v0, f0
    289 	;;
    290 		getfsig	pr0_2 = fp0b_2
    291 		xma.l	fp1b_2 = uy, v1, fp1a_2
    292 		xma.hu	fp2a_2 = uy, v1, fp1a_2
    293 	;;
    294 		ldf8	u_2 = [up], 8
    295 		getfsig	pr1_1 = fp1b_1
    296 	;;
    297 .mfi;		getfsig	acc1_1 = fp2a_1
    298 		xma.l	fp0b_0 = u_0, v0, f0
    299 		cmp.ne	p8, p9 = r0, r0
    300 .mfb;		cmp.ne	p12, p13 = r0, r0
    301 		xma.hu	fp1a_0 = u_0, v0, f0
    302 		br	L(10)
    303 
    304 
    305 	ALIGN(32)
        C Feed-in for n mod 4 == 3.  Zeroes the accumulator inputs that have no
        C producer yet (acc1_3, pr1_3, pr0_0), loads the third u limb into u_2 and
        C clears p6 (setting p7).  br.cloop goes to gt3 (decrementing ar.lc) when
        C ar.lc is non-zero and falls through to the short path otherwise.
    306 L(b11):		mov	acc1_3 = 0
    307 		mov	pr1_3 = 0
    308 		mov	pr0_0 = 0
    309 		ldf8	u_2 = [up], 8
    310 		cmp.ne	p6, p7 = r0, r0
    311 		br.cloop.dptk	L(gt3)
    312 	;;
        C Short path: no further loads.  Issue the feed-in products, clear p10
        C (setting p11) and join the wind-down code at cj3.
    313 		xma.l	fp0b_0 = ux, v0, f0
    314 		xma.hu	fp1a_0 = ux, v0, f0
    315 	;;
    316 		cmp.ne	p10, p11 = r0, r0
    317 		xma.l	fp0b_1 = uy, v0, f0
    318 		xma.hu	fp1a_1 = uy, v0, f0
    319 	;;
    320 		getfsig	acc0 = fp0b_0
    321 		xma.l	fp1b_0 = ux, v1, fp1a_0
    322 		xma.hu	fp2a_0 = ux, v1, fp1a_0
    323 	;;
    324 		xma.l	fp0b_2 = u_2, v0, f0
    325 		xma.hu	fp1a_2 = u_2, v0, f0
    326 	;;
    327 		getfsig	pr0_1 = fp0b_1
    328 		xma.l	fp1b_1 = uy, v1, fp1a_1
    329 		xma.hu	fp2a_1 = uy, v1, fp1a_1
    330 	;;
    331 		getfsig	pr1_0 = fp1b_0
    332 		getfsig	acc1_0 = fp2a_0
    333 		br	L(cj3)
    334 
        C Long path: the same products plus the loads of u_3, u_0 and u_1 that
        C the loop consumes.  Enters the loop at L(11).
    335 L(gt3):		xma.l	fp0b_0 = ux, v0, f0
    336 		cmp.ne	p10, p11 = r0, r0
    337 		ldf8	u_3 = [up], 8
    338 		xma.hu	fp1a_0 = ux, v0, f0
    339 	;;
    340 		xma.l	fp0b_1 = uy, v0, f0
    341 		xma.hu	fp1a_1 = uy, v0, f0
    342 	;;
    343 		getfsig	acc0 = fp0b_0
    344 		xma.l	fp1b_0 = ux, v1, fp1a_0
    345 		ldf8	u_0 = [up], 8
    346 		xma.hu	fp2a_0 = ux, v1, fp1a_0
    347 	;;
    348 		xma.l	fp0b_2 = u_2, v0, f0
    349 		xma.hu	fp1a_2 = u_2, v0, f0
    350 	;;
    351 		getfsig	pr0_1 = fp0b_1
    352 		xma.l	fp1b_1 = uy, v1, fp1a_1
    353 		xma.hu	fp2a_1 = uy, v1, fp1a_1
    354 	;;
    355 		ldf8	u_1 = [up], 8
    356 		getfsig	pr1_0 = fp1b_0
    357 	;;
    358 		getfsig	acc1_0 = fp2a_0
    359 		xma.l	fp0b_3 = u_3, v0, f0
    360 		xma.hu	fp1a_3 = u_3, v0, f0
    361 		br	L(11)
    362 
    363 
    364 C *** MAIN LOOP START ***
        C Software-pipelined loop.  Each pass issues four ldf8 loads of u limbs,
        C four xma.l/xma.hu pairs against v0, four pairs against v1, and four st8
        C stores of finished result limbs, in the twelve instruction groups marked
        C C 00 to C 11.
        C
        C For every result limb the pattern is:
        C   acc0 = pr0_x + acc1_y + carry    (carry in p6/p7 or p8/p9)
        C   s0   = pr1_z + acc0 + carry      (carry in p10/p11 or p12/p13)
        C   st8 of s0
        C The carry out of each add is produced one group later by a predicated
        C compare pair: with carry-in the sum wrapped when sum <= addend
        C (cmp.leu), without carry-in when sum < addend (cmp.ltu).
        C
        C L(01), L(00), L(11) and L(10) are the points where the four feed-in
        C variants enter the loop.
    365 	ALIGN(32)
    366 L(top):						C 00
    367 		.pred.rel "mutex", p8, p9
    368 		.pred.rel "mutex", p12, p13
    369 		ldf8	u_3 = [up], 8
    370 		getfsig	pr1_2 = fp1b_2
    371 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    372 	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    373 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    374 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    375 	;;					C 01
    376 		.pred.rel "mutex", p6, p7
    377 		getfsig	acc1_2 = fp2a_2
    378 		st8	[rp] = s0, 8
    379 		xma.l	fp0b_1 = u_1, v0, f0
    380 	(p6)	add	acc0 = pr0_2, acc1_0, 1
    381 	(p7)	add	acc0 = pr0_2, acc1_0
    382 		xma.hu	fp1a_1 = u_1, v0, f0
    383 	;;					C 02
        C Loop entry for the n mod 4 == 1 feed-in.
    384 L(01):
    385 		.pred.rel "mutex", p10, p11
    386 		getfsig	pr0_0 = fp0b_0
    387 		xma.l	fp1b_0 = u_0, v1, fp1a_0
    388 	(p10)	add	s0 = pr1_1, acc0, 1
    389 	(p11)	add	s0 = pr1_1, acc0
    390 		xma.hu	fp2a_0 = u_0, v1, fp1a_0
    391 		nop	1
    392 	;;					C 03
    393 		.pred.rel "mutex", p6, p7
    394 		.pred.rel "mutex", p10, p11
    395 		ldf8	u_0 = [up], 8
    396 		getfsig	pr1_3 = fp1b_3
    397 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    398 	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    399 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    400 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    401 	;;					C 04
    402 		.pred.rel "mutex", p8, p9
    403 		getfsig	acc1_3 = fp2a_3
    404 		st8	[rp] = s0, 8
    405 		xma.l	fp0b_2 = u_2, v0, f0
    406 	(p8)	add	acc0 = pr0_3, acc1_1, 1
    407 	(p9)	add	acc0 = pr0_3, acc1_1
    408 		xma.hu	fp1a_2 = u_2, v0, f0
    409 	;;					C 05
        C Loop entry for the n mod 4 == 0 feed-in.
    410 L(00):
    411 		.pred.rel "mutex", p12, p13
    412 		getfsig	pr0_1 = fp0b_1
    413 		xma.l	fp1b_1 = u_1, v1, fp1a_1
    414 	(p12)	add	s0 = pr1_2, acc0, 1
    415 	(p13)	add	s0 = pr1_2, acc0
    416 		xma.hu	fp2a_1 = u_1, v1, fp1a_1
    417 		nop	1
    418 	;;					C 06
    419 		.pred.rel "mutex", p8, p9
    420 		.pred.rel "mutex", p12, p13
    421 		ldf8	u_1 = [up], 8
    422 		getfsig	pr1_0 = fp1b_0
    423 	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
    424 	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
    425 	(p12)	cmp.leu	p10, p11 = s0, pr1_2
    426 	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
    427 	;;					C 07
    428 		.pred.rel "mutex", p6, p7
    429 		getfsig	acc1_0 = fp2a_0
    430 		st8	[rp] = s0, 8
    431 		xma.l	fp0b_3 = u_3, v0, f0
    432 	(p6)	add	acc0 = pr0_0, acc1_2, 1
    433 	(p7)	add	acc0 = pr0_0, acc1_2
    434 		xma.hu	fp1a_3 = u_3, v0, f0
    435 	;;					C 08
        C Loop entry for the n mod 4 == 3 feed-in.
    436 L(11):
    437 		.pred.rel "mutex", p10, p11
    438 		getfsig	pr0_2 = fp0b_2
    439 		xma.l	fp1b_2 = u_2, v1, fp1a_2
    440 	(p10)	add	s0 = pr1_3, acc0, 1
    441 	(p11)	add	s0 = pr1_3, acc0
    442 		xma.hu	fp2a_2 = u_2, v1, fp1a_2
    443 		nop	1
    444 	;;					C 09
    445 		.pred.rel "mutex", p6, p7
    446 		.pred.rel "mutex", p10, p11
    447 		ldf8	u_2 = [up], 8
    448 		getfsig	pr1_1 = fp1b_1
    449 	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
    450 	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
    451 	(p10)	cmp.leu	p12, p13 = s0, pr1_3
    452 	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
    453 	;;					C 10
    454 		.pred.rel "mutex", p8, p9
    455 		getfsig	acc1_1 = fp2a_1
    456 		st8	[rp] = s0, 8
    457 		xma.l	fp0b_0 = u_0, v0, f0
    458 	(p8)	add	acc0 = pr0_1, acc1_3, 1
    459 	(p9)	add	acc0 = pr0_1, acc1_3
    460 		xma.hu	fp1a_0 = u_0, v0, f0
    461 	;;					C 11
        C Loop entry for the n mod 4 == 2 feed-in.  br.cloop repeats the loop and
        C decrements ar.lc while ar.lc is non-zero.
    462 L(10):
    463 		.pred.rel "mutex", p12, p13
    464 		getfsig	pr0_3 = fp0b_3
    465 		xma.l	fp1b_3 = u_3, v1, fp1a_3
    466 	(p12)	add	s0 = pr1_0, acc0, 1
    467 	(p13)	add	s0 = pr1_0, acc0
    468 		xma.hu	fp2a_3 = u_3, v1, fp1a_3
    469 		br.cloop.dptk	L(top)
    470 	;;
    471 C *** MAIN LOOP END ***
    472 
        C Wind-down: drain the software pipeline.  No further loads are issued;
        C the groups below repeat the loop pattern (acc0 add, s0 add, carry
        C compares, st8) for the products that are already in flight and issue
        C the remaining xma pairs.  L(cj5), L(cj4) and L(cj3) are the points
        C where the short feed-in paths of b01, b00 and b11 join.
    473 		.pred.rel "mutex", p8, p9
    474 		.pred.rel "mutex", p12, p13
    475 .mmi;		getfsig	pr1_2 = fp1b_2
    476 		st8	[rp] = s0, 8
    477 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    478 .mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    479 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    480 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    481 	;;
    482 		.pred.rel "mutex", p6, p7
    483 .mfi;		getfsig	acc1_2 = fp2a_2
    484 		xma.l	fp0b_1 = u_1, v0, f0
    485 		nop	1
    486 .mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
    487 	(p7)	add	acc0 = pr0_2, acc1_0
    488 		xma.hu	fp1a_1 = u_1, v0, f0
    489 	;;
        C Entry from the short path of the n mod 4 == 1 feed-in.
    490 L(cj5):
    491 		.pred.rel "mutex", p10, p11
    492 .mfi;		getfsig	pr0_0 = fp0b_0
    493 		xma.l	fp1b_0 = u_0, v1, fp1a_0
    494 	(p10)	add	s0 = pr1_1, acc0, 1
    495 .mfi;	(p11)	add	s0 = pr1_1, acc0
    496 		xma.hu	fp2a_0 = u_0, v1, fp1a_0
    497 		nop	1
    498 	;;
    499 		.pred.rel "mutex", p6, p7
    500 		.pred.rel "mutex", p10, p11
    501 .mmi;		getfsig	pr1_3 = fp1b_3
    502 		st8	[rp] = s0, 8
    503 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    504 .mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    505 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    506 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    507 	;;
    508 		.pred.rel "mutex", p8, p9
    509 .mfi;		getfsig	acc1_3 = fp2a_3
    510 		xma.l	fp0b_2 = u_2, v0, f0
    511 		nop	1
    512 .mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
    513 	(p9)	add	acc0 = pr0_3, acc1_1
    514 		xma.hu	fp1a_2 = u_2, v0, f0
    515 	;;
        C Entry from the short path of the n mod 4 == 0 feed-in.
    516 L(cj4):
    517 		.pred.rel "mutex", p12, p13
    518 .mfi;		getfsig	pr0_1 = fp0b_1
    519 		xma.l	fp1b_1 = u_1, v1, fp1a_1
    520 	(p12)	add	s0 = pr1_2, acc0, 1
    521 .mfi;	(p13)	add	s0 = pr1_2, acc0
    522 		xma.hu	fp2a_1 = u_1, v1, fp1a_1
    523 		nop	1
    524 	;;
    525 		.pred.rel "mutex", p8, p9
    526 		.pred.rel "mutex", p12, p13
    527 .mmi;		getfsig	pr1_0 = fp1b_0
    528 		st8	[rp] = s0, 8
    529 	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
    530 .mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
    531 	(p12)	cmp.leu	p10, p11 = s0, pr1_2
    532 	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
    533 	;;
    534 		.pred.rel "mutex", p6, p7
    535 .mmi;		getfsig	acc1_0 = fp2a_0
    536 	(p6)	add	acc0 = pr0_0, acc1_2, 1
    537 	(p7)	add	acc0 = pr0_0, acc1_2
    538 	;;
        C Entry from the short path of the n mod 4 == 3 feed-in.  The last xma
        C pair of the routine is issued here.
    539 L(cj3):
    540 		.pred.rel "mutex", p10, p11
    541 .mfi;		getfsig	pr0_2 = fp0b_2
    542 		xma.l	fp1b_2 = u_2, v1, fp1a_2
    543 	(p10)	add	s0 = pr1_3, acc0, 1
    544 .mfi;	(p11)	add	s0 = pr1_3, acc0
    545 		xma.hu	fp2a_2 = u_2, v1, fp1a_2
    546 		nop	1
    547 	;;
    548 		.pred.rel "mutex", p6, p7
    549 		.pred.rel "mutex", p10, p11
    550 .mmi;		getfsig	pr1_1 = fp1b_1
    551 		st8	[rp] = s0, 8
    552 	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
    553 .mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
    554 	(p10)	cmp.leu	p12, p13 = s0, pr1_3
    555 	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
    556 	;;
    557 		.pred.rel "mutex", p8, p9
    558 .mmi;		getfsig	acc1_1 = fp2a_1
    559 	(p8)	add	acc0 = pr0_1, acc1_3, 1
    560 	(p9)	add	acc0 = pr0_1, acc1_3
    561 	;;
    562 		.pred.rel "mutex", p12, p13
    563 .mmi;	(p12)	add	s0 = pr1_0, acc0, 1
    564 	(p13)	add	s0 = pr1_0, acc0
    565 		nop	1
    566 	;;
    567 		.pred.rel "mutex", p8, p9
    568 		.pred.rel "mutex", p12, p13
    569 .mmi;		getfsig	pr1_2 = fp1b_2
    570 		st8	[rp] = s0, 8
    571 	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
    572 .mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
    573 	(p12)	cmp.leu	p10, p11 = s0, pr1_0
    574 	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
    575 	;;
        C r8 starts as the high half of the last u * v1 product (fp2a_2).
    576 		.pred.rel "mutex", p6, p7
    577 .mmi;		getfsig	r8 = fp2a_2
    578 	(p6)	add	acc0 = pr0_2, acc1_0, 1
    579 	(p7)	add	acc0 = pr0_2, acc1_0
    580 	;;
    581 		.pred.rel "mutex", p10, p11
    582 .mmi;	(p10)	add	s0 = pr1_1, acc0, 1
    583 	(p11)	add	s0 = pr1_1, acc0
    584 	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
    585 	;;
    586 		.pred.rel "mutex", p10, p11
    587 .mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
    588 	(p10)	cmp.leu	p12, p13 = s0, pr1_1
    589 	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
    590 	;;
        C Last stored limb: acc0 = pr1_2 + acc1_1 + carry (p8).
    591 		.pred.rel "mutex", p8, p9
    592 .mmi;		st8	[rp] = s0, 8
    593 	(p8)	add	acc0 = pr1_2, acc1_1, 1
    594 	(p9)	add	acc0 = pr1_2, acc1_1
    595 	;;
        C p10 = carry out of the add above.  When p12 (carry from the previous
        C s0 add) is set, acc0 is additionally incremented.
    596 		.pred.rel "mutex", p8, p9
    597 .mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
    598 	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
    599 	(p12)	add	acc0 = 1, acc0
    600 	;;
        C cmpeqor is not defined in this file; presumably it is an m4 wrapper
        C around cmp.eq.or from the included definitions, in which case p10 also
        C becomes set when the increment wrapped acc0 to zero - TODO confirm.
    601 .mmi;		st8	[rp] = acc0, 8
    602 	(p12)	cmpeqor	p10, p0 = 0, acc0
    603 		nop	1
    604 	;;
        C Add the final carry into r8, restore ar.lc from r2 and return.
    605 .mib;	(p10)	add	r8 = 1, r8
    606 		mov	ar.lc = r2
    607 		br.ret.sptk.many b0
    608 EPILOGUE()
    609 ASM_END()
    610