Home | History | Annotate | Line # | Download | only in ev6
      1 dnl  Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
      2 dnl  result in a second limb vector.
      3 
      4 dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
      5 
      6 dnl  This file is part of the GNU MP Library.
      7 dnl
      8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
      9 dnl  it under the terms of either:
     10 dnl
     11 dnl    * the GNU Lesser General Public License as published by the Free
     12 dnl      Software Foundation; either version 3 of the License, or (at your
     13 dnl      option) any later version.
     14 dnl
     15 dnl  or
     16 dnl
     17 dnl    * the GNU General Public License as published by the Free Software
     18 dnl      Foundation; either version 2 of the License, or (at your option) any
     19 dnl      later version.
     20 dnl
     21 dnl  or both in parallel, as here.
     22 dnl
     23 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     24 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 dnl  for more details.
     27 dnl
     28 dnl  You should have received copies of the GNU General Public License and the
     29 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 dnl  see https://www.gnu.org/licenses/.
     31 
     32 include(`../config.m4')
     33 
     34 C INPUT PARAMETERS
     35 C res_ptr	r16
     36 C s1_ptr	r17
     37 C size		r18
     38 C s2_limb	r19
     39 
     40 C This code runs at 2.25 cycles/limb on EV6.
     41 
     42 C This code was written in close cooperation with ev6 pipeline expert
     43 C Steve Root.  Any errors are tege's fault, though.
     44 
     45 C Code structure:
     46 
     47 C  code for n < 8
     48 C  code for n >= 8	code for (n mod 8)
     49 C			code for (n div 8)	feed-in code
     50 C						8-way unrolled loop
     51 C						wind-down code
     52 
     53 C Some notes about unrolled loop:
     54 C
     55 C   r1-r8     multiplies and workup
     56 C   r21-r28   multiplies and workup
     57 C   r9-r12    loads
     58 C   r0       -1
     59 C   r20,r29,r13-r15  scramble
     60 C
     61 C   We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
     62 C   put-the-carry-into-hi.  The idea is that these branches are very rarely
     63 C   taken, and since a non-taken branch consumes no resources, that is better
     64 C   than an addq.
     65 C
     66 C   Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
     67 C   add NEXT cycle #09 which feeds a store in NEXT cycle #02
     68 
     69 C The code could use some further work:
     70 C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
     71 C      faster than this for size < 3.
     72 C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
     73 C      that is too costly.
     74 C   3. Consider using 4-way unrolling, even if that runs slower.
     75 C   4. Reduce register usage.  In particular, try to avoid using r29.
     76 
     77 ASM_START()
        C mpn_mul_1: multiply the size-limb vector at s1_ptr (r17) by s2_limb (r19)
        C and store the low product limbs at res_ptr (r16).  The final carry limb
        C is left in r0 on every return path.  Sizes below 8 use the simple loop at
        C $Lsmall; sizes of 8 or more go to $Large, which runs a feed-in loop for
        C (size mod 8) limbs followed by the software-pipelined 8-way unrolled code.
     78 PROLOGUE(mpn_mul_1)
     79 	cmpult	r18,	8,	r1	C r1 = 1 if size < 8 (unsigned), else 0
     80 	beq	r1,	$Large	C size >= 8: use the unrolled code
        C Small path: one limb per iteration.  r0 carries the previous prod_high
        C (or carry flag), r4 the current cy_limb.
     81 $Lsmall:
     82 	ldq	r2,0(r17)	C r2 = s1_limb
     83 	lda	r18,-1(r18)	C size--
     84 	mulq	r2,r19,r3	C r3 = prod_low
     85 	bic	r31,r31,r4	C clear cy_limb
     86 	umulh	r2,r19,r0	C r0 = prod_high
     87 	beq	r18,$Le1a	C jump if size was == 1
     88 	ldq	r2,8(r17)	C r2 = s1_limb
     89 	lda	r18,-1(r18)	C size--
     90 	stq	r3,0(r16)	C store low limb of first product
     91 	beq	r18,$Le2a	C jump if size was == 2
     92 	ALIGN(8)
     93 $Lopa:	mulq	r2,r19,r3	C r3 = prod_low
     94 	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
     95 	lda	r18,-1(r18)	C size--
     96 	umulh	r2,r19,r4	C r4 = cy_limb
     97 	ldq	r2,16(r17)	C r2 = s1_limb
     98 	lda	r17,8(r17)	C s1_ptr++
     99 	addq	r3,r0,r3	C r3 = cy_limb + prod_low
    100 	stq	r3,8(r16)	C store result limb (res_ptr is bumped below)
    101 	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
    102 	lda	r16,8(r16)	C res_ptr++
    103 	bne	r18,$Lopa
    104 
        C Last limb of the small path; r2 already holds it.
    105 $Le2a:	mulq	r2,r19,r3	C r3 = prod_low
    106 	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
    107 	umulh	r2,r19,r4	C r4 = cy_limb
    108 	addq	r3,r0,r3	C r3 = cy_limb + prod_low
    109 	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
    110 	stq	r3,8(r16)	C store last result limb
    111 	addq	r4,r0,r0	C cy_limb = prod_high + cy
    112 	ret	r31,(r26),1	C return with carry limb in r0
    113 $Le1a:	stq	r3,0(r16)	C size == 1: store the only limb, r0 = prod_high
    114 	ret	r31,(r26),1
    115 
        C Large path, size >= 8.  r9-r15, r26 and r29 are all overwritten by the
        C code below, so they are stored in a 224-byte frame below r30 here and
        C reloaded just before the final ret.
    116 $Large:
    117 	lda	r30,	-224(r30)	C r30 -= 224, room for the save area
    118 	stq	r26,	0(r30)	C save r26, it is used as a product register
    119 	stq	r9,	8(r30)	C save r9-r12, used for loads
    120 	stq	r10,	16(r30)
    121 	stq	r11,	24(r30)
    122 	stq	r12,	32(r30)
    123 	stq	r13,	40(r30)	C save r13-r15, used for sums and carries
    124 	stq	r14,	48(r30)
    125 	stq	r15,	56(r30)
    126 	stq	r29,	64(r30)	C save r29, used as a carry flag
    127 
    128 	and	r18,	7,	r20	C count for the first loop, 0-7
    129 	srl	r18,	3,	r18	C count for unrolled loop
    130 	bis	r31,	r31,	r21	C r21 = 0, carry-in limb when first loop is skipped
    131 	beq	r20,	$L_8_or_more	C skip first loop
    132 
        C Feed-in loop: handle the first (size mod 8) limbs one at a time and leave
        C the resulting carry limb in r21.  NOTE: despite the label name, this is
        C reached for every size >= 8 that is not a multiple of 8.
    133 $L_9_or_more:
    134 	ldq	r2,0(r17)	C r2 = s1_limb
    135 	lda	r17,8(r17)	C s1_ptr++
    136 	lda	r20,-1(r20)	C size--
    137 	mulq	r2,r19,r3	C r3 = prod_low
    138 	umulh	r2,r19,r21	C r21 = prod_high
    139 	beq	r20,$Le1b	C jump if size was == 1
    140 	bis	r31, r31, r0	C FIXME: shouldn't need this
    141 	ldq	r2,0(r17)	C r2 = s1_limb
    142 	lda	r17,8(r17)	C s1_ptr++
    143 	lda	r20,-1(r20)	C size--
    144 	stq	r3,0(r16)	C store low limb of first product
    145 	lda	r16,8(r16)	C res_ptr++
    146 	beq	r20,$Le2b	C jump if size was == 2
    147 	ALIGN(8)
    148 $Lopb:	mulq	r2,r19,r3	C r3 = prod_low
    149 	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
    150 	lda	r20,-1(r20)	C size--
    151 	umulh	r2,r19,r21	C r21 = prod_high
    152 	ldq	r2,0(r17)	C r2 = s1_limb
    153 	lda	r17,8(r17)	C s1_ptr++
    154 	addq	r3,r0,r3	C r3 = cy_limb + prod_low
    155 	stq	r3,0(r16)	C store result limb
    156 	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
    157 	lda	r16,8(r16)	C res_ptr++
    158 	bne	r20,$Lopb
    159 
        C Last limb of the feed-in loop; r2 already holds it.
    160 $Le2b:	mulq	r2,r19,r3	C r3 = prod_low
    161 	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
    162 	umulh	r2,r19,r21	C r21 = prod_high
    163 	addq	r3,r0,r3	C r3 = cy_limb + prod_low
    164 	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
    165 	stq	r3,0(r16)	C store result limb
    166 	lda	r16,8(r16)	C res_ptr++
    167 	addq	r21,r0,r21	C cy_limb = prod_high + cy
    168 	br	r31,	$L_8_or_more	C r21 = carry limb into the unrolled code
    169 $Le1b:	stq	r3,0(r16)	C single feed-in limb: store it, r21 = prod_high
    170 	lda	r16,8(r16)	C res_ptr++
    171 
        C Start of the unrolled code.  r18 = number of 8-limb groups, r21 = carry
        C limb so far.  The instructions below load the first 8 limbs of the group
        C and start their multiplies; r17 is biased by -32 first so that the load
        C offsets match those used inside the loop.
    172 $L_8_or_more:
    173 	lda	r0,	-1(r31)		C put -1 in r0, for tricky loop control
    174 	lda	r17,	-32(r17)	C L1 bookkeeping
    175 	lda	r18,	-1(r18)		C decrement count
    176 
    177 	ldq	r9,	32(r17)		C L1
    178 	ldq	r10,	40(r17)		C L1
    179 	mulq	r9,	r19,	r22	C U1 #07
    180 	ldq	r11,	48(r17)		C L1
    181 	umulh	r9,	r19,	r23	C U1 #08
    182 	ldq	r12,	56(r17)		C L1
    183 	mulq	r10,	r19,	r24	C U1 #09
    184 	ldq	r9,	64(r17)		C L1
    185 
    186 	lda	r17,	64(r17)		C L1 bookkeeping
    187 
    188 	umulh	r10,	r19,	r25	C U1 #11
    189 	mulq	r11,	r19,	r26	C U1 #12
    190 	umulh	r11,	r19,	r27	C U1 #13
    191 	mulq	r12,	r19,	r28	C U1 #14
    192 	ldq	r10,	8(r17)		C L1
    193 	umulh	r12,	r19,	r1	C U1 #15
    194 	ldq	r11,	16(r17)		C L1
    195 	mulq	r9,	r19,	r2	C U1 #16
    196 	ldq	r12,	24(r17)		C L1
    197 	umulh	r9,	r19,	r3	C U1 #17
    198 	addq	r21,	r22,	r13	C L1 mov
    199 	mulq	r10,	r19,	r4	C U1 #18
    200 	addq	r23,	r24,	r22	C L0 sum 2 mul's
    201 	cmpult	r13,	r21,	r14	C L1 carry from sum
    202 	bgt	r18,	$L_16_or_more	C r18 > 0: at least two 8-limb groups
    203 
        C Exactly one 8-limb group: finish its multiplies without loading any
        C further limbs, then join the wind-down code at $ret0c.
    204 	cmpult	r22,	r24,	r24	C U0 carry from sum
    205 	umulh	r10,	r19,	r5	C U1 #02
    206 	addq	r25,	r26,	r23	C U0 sum 2 mul's
    207 	mulq	r11,	r19,	r6	C U1 #03
    208 	cmpult	r23,	r26,	r25	C U0 carry from sum
    209 	umulh	r11,	r19,	r7	C U1 #04
    210 	addq	r27,	r28,	r28	C U0 sum 2 mul's
    211 	mulq	r12,	r19,	r8	C U1 #05
    212 	cmpult	r28,	r27,	r15	C L0 carry from sum
    213 	lda	r16,	32(r16)		C L1 bookkeeping
    214 	addq	r13,	r31,	r13	C U0 start carry cascade
    215 	umulh	r12,	r19,	r21	C U1 #06
    216 	br	r31,	$ret0c	C continue in the wind-down code
    217 
        C Two or more 8-limb groups: a copy of the loop body without the stores
        C of the previous iteration, since nothing has been computed before it.
    218 $L_16_or_more:
    219 C ---------------------------------------------------------------
    220 	subq	r18,1,r18	C r18 = number of 8-limb groups - 2
    221 	cmpult	r22,	r24,	r24	C U0 carry from sum
    222 	ldq	r9,	32(r17)		C L1
    223 
    224 	umulh	r10,	r19,	r5	C U1 #02
    225 	addq	r25,	r26,	r23	C U0 sum 2 mul's
    226 	mulq	r11,	r19,	r6	C U1 #03
    227 	cmpult	r23,	r26,	r25	C U0 carry from sum
    228 	umulh	r11,	r19,	r7	C U1 #04
    229 	addq	r27,	r28,	r28	C U0 sum 2 mul's
    230 	mulq	r12,	r19,	r8	C U1 #05
    231 	cmpult	r28,	r27,	r15	C L0 carry from sum
    232 	lda	r16,	32(r16)		C L1 bookkeeping
    233 	addq	r13,	r31,	r13	C U0 start carry cascade
    234 
    235 	umulh	r12,	r19,	r21	C U1 #06
    236 C	beq	r13,	$fix0w		C U0
    237 $ret0w:	addq	r22,	r14,	r26	C L0
    238 	ldq	r10,	40(r17)		C L1
    239 
    240 	mulq	r9,	r19,	r22	C U1 #07
    241 	beq	r26,	$fix1w		C U0
    242 $ret1w:	addq	r23,	r24,	r27	C L0
    243 	ldq	r11,	48(r17)		C L1
    244 
    245 	umulh	r9,	r19,	r23	C U1 #08
    246 	beq	r27,	$fix2w		C U0
    247 $ret2w:	addq	r28,	r25,	r28	C L0
    248 	ldq	r12,	56(r17)		C L1
    249 
    250 	mulq	r10,	r19,	r24	C U1 #09
    251 	beq	r28,	$fix3w		C U0
    252 $ret3w:	addq	r1,	r2,	r20	C L0 sum 2 mul's
    253 	ldq	r9,	64(r17)		C L1
    254 
    255 	addq	r3,	r4,	r2	C L0 #10 2 mul's
    256 	lda	r17,	64(r17)		C L1 bookkeeping
    257 	cmpult	r20,	r1,	r29	C U0 carry from sum
    258 
    259 	umulh	r10,	r19,	r25	C U1 #11
    260 	cmpult	r2,	r4,	r4	C U0 carry from sum
    261 	stq	r13,	-32(r16)	C L0
    262 	stq	r26,	-24(r16)	C L1
    263 
    264 	mulq	r11,	r19,	r26	C U1 #12
    265 	addq	r5,	r6,	r14	C U0 sum 2 mul's
    266 	stq	r27,	-16(r16)	C L0
    267 	stq	r28,	-8(r16)		C L1
    268 
    269 	umulh	r11,	r19,	r27	C U1 #13
    270 	cmpult	r14,	r6,	r3	C U0 carry from sum
    271 C could do cross-jumping here:
    272 C	bra	$L_middle_of_unrolled_loop
    273 	mulq	r12,	r19,	r28	C U1 #14
    274 	addq	r7,	r3,	r5	C L0 eat carry
    275 	addq	r20,	r15,	r20	C U0 carry cascade
    276 	ldq	r10,	8(r17)		C L1
    277 
    278 	umulh	r12,	r19,	r1	C U1 #15
    279 	beq	r20,	$fix4		C U0
    280 $ret4w:	addq	r2,	r29,	r6	C L0
    281 	ldq	r11,	16(r17)		C L1
    282 
    283 	mulq	r9,	r19,	r2	C U1 #16
    284 	beq	r6,	$fix5		C U0
    285 $ret5w:	addq	r14,	r4,	r7	C L0
    286 	ldq	r12,	24(r17)		C L1
    287 
    288 	umulh	r9,	r19,	r3	C U1 #17
    289 	beq	r7,	$fix6		C U0
    290 $ret6w:	addq	r5,	r8,	r8	C L0 sum 2
    291 	addq	r21,	r22,	r13	C L1 sum 2 mul's
    292 
    293 	mulq	r10,	r19,	r4	C U1 #18
    294 	addq	r23,	r24,	r22	C L0 sum 2 mul's
    295 	cmpult	r13,	r21,	r14	C L1 carry from sum
    296 	ble	r18,	$Lend		C U0
    297 C ---------------------------------------------------------------
        C Main loop.  Each iteration stores 8 result limbs and loads 8 source
        C limbs.  r0 holds -1, so the umulh at the top produces the high half of
        C (2^64-1)*r18, which is r18 - 1 whenever r18 >= 1; r18 is always >= 1
        C here because both branches into the loop test it first.
    298 	ALIGN(16)
    299 $Loop:
    300 	umulh	r0,	r18,	r18	C U1 #01 decrement r18!
    301 	cmpult	r8,	r5,	r29	C L0 carry from last bunch
    302 	cmpult	r22,	r24,	r24	C U0 carry from sum
    303 	ldq	r9,	32(r17)		C L1
    304 
    305 	umulh	r10,	r19,	r5	C U1 #02
    306 	addq	r25,	r26,	r23	C U0 sum 2 mul's
    307 	stq	r20,	0(r16)		C L0
    308 	stq	r6,	8(r16)		C L1
    309 
    310 	mulq	r11,	r19,	r6	C U1 #03
    311 	cmpult	r23,	r26,	r25	C U0 carry from sum
    312 	stq	r7,	16(r16)		C L0
    313 	stq	r8,	24(r16)		C L1
    314 
    315 	umulh	r11,	r19,	r7	C U1 #04
    316 	bis	r31,	r31,	r31	C L0 st slosh
    317 	bis	r31,	r31,	r31	C L1 st slosh
    318 	addq	r27,	r28,	r28	C U0 sum 2 mul's
    319 
    320 	mulq	r12,	r19,	r8	C U1 #05
    321 	cmpult	r28,	r27,	r15	C L0 carry from sum
    322 	lda	r16,	64(r16)		C L1 bookkeeping
    323 	addq	r13,	r29,	r13	C U0 start carry cascade
    324 
    325 	umulh	r12,	r19,	r21	C U1 #06
    326 	beq	r13,	$fix0		C U0
    327 $ret0:	addq	r22,	r14,	r26	C L0
    328 	ldq	r10,	40(r17)		C L1
    329 
    330 	mulq	r9,	r19,	r22	C U1 #07
    331 	beq	r26,	$fix1		C U0
    332 $ret1:	addq	r23,	r24,	r27	C L0
    333 	ldq	r11,	48(r17)		C L1
    334 
    335 	umulh	r9,	r19,	r23	C U1 #08
    336 	beq	r27,	$fix2		C U0
    337 $ret2:	addq	r28,	r25,	r28	C L0
    338 	ldq	r12,	56(r17)		C L1
    339 
    340 	mulq	r10,	r19,	r24	C U1 #09
    341 	beq	r28,	$fix3		C U0
    342 $ret3:	addq	r1,	r2,	r20	C L0 sum 2 mul's
    343 	ldq	r9,	64(r17)		C L1
    344 
    345 	addq	r3,	r4,	r2	C L0 #10 2 mul's
    346 	bis	r31,	r31,	r31	C U1 mul hole
    347 	lda	r17,	64(r17)		C L1 bookkeeping
    348 	cmpult	r20,	r1,	r29	C U0 carry from sum
    349 
    350 	umulh	r10,	r19,	r25	C U1 #11
    351 	cmpult	r2,	r4,	r4	C U0 carry from sum
    352 	stq	r13,	-32(r16)	C L0
    353 	stq	r26,	-24(r16)	C L1
    354 
    355 	mulq	r11,	r19,	r26	C U1 #12
    356 	addq	r5,	r6,	r14	C U0 sum 2 mul's
    357 	stq	r27,	-16(r16)	C L0
    358 	stq	r28,	-8(r16)		C L1
    359 
    360 	umulh	r11,	r19,	r27	C U1 #13
    361 	bis	r31,	r31,	r31	C L0 st slosh
    362 	bis	r31,	r31,	r31	C L1 st slosh
    363 	cmpult	r14,	r6,	r3	C U0 carry from sum
    364 $L_middle_of_unrolled_loop:
    365 	mulq	r12,	r19,	r28	C U1 #14
    366 	addq	r7,	r3,	r5	C L0 eat carry
    367 	addq	r20,	r15,	r20	C U0 carry cascade
    368 	ldq	r10,	8(r17)		C L1
    369 
    370 	umulh	r12,	r19,	r1	C U1 #15
    371 	beq	r20,	$fix4		C U0
    372 $ret4:	addq	r2,	r29,	r6	C L0
    373 	ldq	r11,	16(r17)		C L1
    374 
    375 	mulq	r9,	r19,	r2	C U1 #16
    376 	beq	r6,	$fix5		C U0
    377 $ret5:	addq	r14,	r4,	r7	C L0
    378 	ldq	r12,	24(r17)		C L1
    379 
    380 	umulh	r9,	r19,	r3	C U1 #17
    381 	beq	r7,	$fix6		C U0
    382 $ret6:	addq	r5,	r8,	r8	C L0 sum 2
    383 	addq	r21,	r22,	r13	C L1 sum 2 mul's
    384 
    385 	mulq	r10,	r19,	r4	C U1 #18
    386 	addq	r23,	r24,	r22	C L0 sum 2 mul's
    387 	cmpult	r13,	r21,	r14	C L1 carry from sum
    388 	bgt	r18,	$Loop		C U0
    389 C ---------------------------------------------------------------
        C Wind-down: finish the limbs already loaded.  No further source limbs
        C are loaded from here on; the only loads left restore saved registers.
    390 $Lend:
    391 	cmpult	r8,	r5,	r29	C L0 carry from last bunch
    392 	cmpult	r22,	r24,	r24	C U0 carry from sum
    393 
    394 	umulh	r10,	r19,	r5	C U1 #02
    395 	addq	r25,	r26,	r23	C U0 sum 2 mul's
    396 	stq	r20,	0(r16)		C L0
    397 	stq	r6,	8(r16)		C L1
    398 
    399 	mulq	r11,	r19,	r6	C U1 #03
    400 	cmpult	r23,	r26,	r25	C U0 carry from sum
    401 	stq	r7,	16(r16)		C L0
    402 	stq	r8,	24(r16)		C L1
    403 
    404 	umulh	r11,	r19,	r7	C U1 #04
    405 	addq	r27,	r28,	r28	C U0 sum 2 mul's
    406 
    407 	mulq	r12,	r19,	r8	C U1 #05
    408 	cmpult	r28,	r27,	r15	C L0 carry from sum
    409 	lda	r16,	64(r16)		C L1 bookkeeping
    410 	addq	r13,	r29,	r13	C U0 start carry cascade
    411 
    412 	umulh	r12,	r19,	r21	C U1 #06
    413 	beq	r13,	$fix0c		C U0
    414 $ret0c:	addq	r22,	r14,	r26	C L0
    415 	beq	r26,	$fix1c		C U0
    416 $ret1c:	addq	r23,	r24,	r27	C L0
    417 	beq	r27,	$fix2c		C U0
    418 $ret2c:	addq	r28,	r25,	r28	C L0
    419 	beq	r28,	$fix3c		C U0
    420 $ret3c:	addq	r1,	r2,	r20	C L0 sum 2 mul's
    421 	addq	r3,	r4,	r2	C L0 #10 2 mul's
    422 	lda	r17,	64(r17)		C L1 bookkeeping
    423 	cmpult	r20,	r1,	r29	C U0 carry from sum
    424 	cmpult	r2,	r4,	r4	C U0 carry from sum
    425 	stq	r13,	-32(r16)	C L0
    426 	stq	r26,	-24(r16)	C L1
    427 	addq	r5,	r6,	r14	C U0 sum 2 mul's
    428 	stq	r27,	-16(r16)	C L0
    429 	stq	r28,	-8(r16)		C L1
    430 	cmpult	r14,	r6,	r3	C U0 carry from sum
    431 	addq	r7,	r3,	r5	C L0 eat carry
    432 	addq	r20,	r15,	r20	C U0 carry cascade
    433 	beq	r20,	$fix4c		C U0
    434 $ret4c:	addq	r2,	r29,	r6	C L0
    435 	beq	r6,	$fix5c		C U0
    436 $ret5c:	addq	r14,	r4,	r7	C L0
    437 	beq	r7,	$fix6c		C U0
    438 $ret6c:	addq	r5,	r8,	r8	C L0 sum 2
    439 	cmpult	r8,	r5,	r29	C L0 carry from last bunch
    440 	stq	r20,	0(r16)		C L0
    441 	stq	r6,	8(r16)		C L1
    442 	stq	r7,	16(r16)		C L0
    443 	stq	r8,	24(r16)		C L1
    444 	addq	r29,	r21,	r0	C r0 = last prod_high + last carry = carry limb
    445 
        C Reload the registers saved at $Large, release the frame and return.
    446 	ldq	r26,	0(r30)
    447 	ldq	r9,	8(r30)
    448 	ldq	r10,	16(r30)
    449 	ldq	r11,	24(r30)
    450 	ldq	r12,	32(r30)
    451 	ldq	r13,	40(r30)
    452 	ldq	r14,	48(r30)
    453 	ldq	r15,	56(r30)
    454 	ldq	r29,	64(r30)
    455 	lda	r30,	224(r30)	C r30 += 224, release the save area
    456 	ret	r31,	(r26),	1
    457 
        C Carry fix-ups.  Each is entered only when the sum just formed (a value
        C plus an incoming carry) is zero.  In that case the carry out of that
        C addition equals the carry that went in, so the incoming carry is merged
        C into the next carry register before jumping back to the matching $ret
        C label.  $fix6 and $fix6c add it into r5 instead of OR-ing it into a flag.
    458 C $fix0w:	bis	r14,	r29,	r14	C join carries
    459 C	br	r31,	$ret0w
    460 $fix1w:	bis	r24,	r14,	r24	C join carries
    461 	br	r31,	$ret1w
    462 $fix2w:	bis	r25,	r24,	r25	C join carries
    463 	br	r31,	$ret2w
    464 $fix3w:	bis	r15,	r25,	r15	C join carries
    465 	br	r31,	$ret3w
    466 $fix0:	bis	r14,	r29,	r14	C join carries
    467 	br	r31,	$ret0
    468 $fix1:	bis	r24,	r14,	r24	C join carries
    469 	br	r31,	$ret1
    470 $fix2:	bis	r25,	r24,	r25	C join carries
    471 	br	r31,	$ret2
    472 $fix3:	bis	r15,	r25,	r15	C join carries
    473 	br	r31,	$ret3
    474 $fix4:	bis	r29,	r15,	r29	C join carries
    475 	br	r31,	$ret4
    476 $fix5:	bis	r4,	r29,	r4	C join carries
    477 	br	r31,	$ret5
    478 $fix6:	addq	r5,	r4,	r5	C can't carry twice!
    479 	br	r31,	$ret6
    480 $fix0c:	bis	r14,	r29,	r14	C join carries
    481 	br	r31,	$ret0c
    482 $fix1c:	bis	r24,	r14,	r24	C join carries
    483 	br	r31,	$ret1c
    484 $fix2c:	bis	r25,	r24,	r25	C join carries
    485 	br	r31,	$ret2c
    486 $fix3c:	bis	r15,	r25,	r15	C join carries
    487 	br	r31,	$ret3c
    488 $fix4c:	bis	r29,	r15,	r29	C join carries
    489 	br	r31,	$ret4c
    490 $fix5c:	bis	r4,	r29,	r4	C join carries
    491 	br	r31,	$ret5c
    492 $fix6c:	addq	r5,	r4,	r5	C can't carry twice!
    493 	br	r31,	$ret6c
    494 
    495 EPILOGUE(mpn_mul_1)
    496 ASM_END()
    497