Home | History | Annotate | Line # | Download | only in k8
      1  1.1  mrg dnl  AMD64 mpn_mulmid_basecase
      2  1.1  mrg 
      3  1.1  mrg dnl  Contributed by David Harvey.
      4  1.1  mrg 
      5  1.1  mrg dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
      6  1.1  mrg 
      7  1.1  mrg dnl  This file is part of the GNU MP Library.
      8  1.1  mrg dnl
      9  1.1  mrg dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     10  1.1  mrg dnl  it under the terms of either:
     11  1.1  mrg dnl
     12  1.1  mrg dnl    * the GNU Lesser General Public License as published by the Free
     13  1.1  mrg dnl      Software Foundation; either version 3 of the License, or (at your
     14  1.1  mrg dnl      option) any later version.
     15  1.1  mrg dnl
     16  1.1  mrg dnl  or
     17  1.1  mrg dnl
     18  1.1  mrg dnl    * the GNU General Public License as published by the Free Software
     19  1.1  mrg dnl      Foundation; either version 2 of the License, or (at your option) any
     20  1.1  mrg dnl      later version.
     21  1.1  mrg dnl
     22  1.1  mrg dnl  or both in parallel, as here.
     23  1.1  mrg dnl
     24  1.1  mrg dnl  The GNU MP Library is distributed in the hope that it will be useful, but
     25  1.1  mrg dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     26  1.1  mrg dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     27  1.1  mrg dnl  for more details.
     28  1.1  mrg dnl
     29  1.1  mrg dnl  You should have received copies of the GNU General Public License and the
     30  1.1  mrg dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
     31  1.1  mrg dnl  see https://www.gnu.org/licenses/.
     32  1.1  mrg 
     33  1.1  mrg 
     34  1.1  mrg include(`../config.m4')
     35  1.1  mrg 
     36  1.1  mrg C	     cycles/limb
     37  1.1  mrg C K8,K9:	 2.375  (2.5 when un - vn is "small")
     38  1.1  mrg C K10:		 ?
     39  1.1  mrg C P4:		 ?
     40  1.1  mrg C P6-15:	 ?
     41  1.1  mrg 
     42  1.1  mrg C INPUT PARAMETERS
     43  1.1  mrg define(`rp',      `%rdi')
     44  1.1  mrg define(`up',      `%rsi')
     45  1.1  mrg define(`un_param',`%rdx')
     46  1.1  mrg define(`vp_param',`%rcx')	C same register as w1; copied into vp on entry
     47  1.1  mrg define(`vn',      `%r8')
     48  1.1  mrg C Limbs of vp used by the current pass, loaded from (vp) and 8(vp)
     49  1.1  mrg define(`v0', `%r12')
     50  1.1  mrg define(`v1', `%r9')
     51  1.1  mrg C Rotating accumulators for the partial products
     52  1.1  mrg define(`w0', `%rbx')
     53  1.1  mrg define(`w1', `%rcx')
     54  1.1  mrg define(`w2', `%rbp')
     55  1.1  mrg define(`w3', `%r10')
     56  1.1  mrg C Loop state
     57  1.1  mrg define(`n',  `%r11')	C inner-loop index
     58  1.1  mrg define(`outer_addr', `%r14')	C target of the indirect jump that ends each pass
     59  1.1  mrg define(`un',  `%r13')	C row length un_param - vn + 1, negated later
     60  1.1  mrg define(`vp',  `%r15')	C working copy of vp_param
     61  1.1  mrg C vp_inner is the same register as w3; only the diagonal code uses it
     62  1.1  mrg define(`vp_inner', `%r10')
     63  1.1  mrg 
     64  1.1  mrg ABI_SUPPORT(DOS64)
     65  1.1  mrg ABI_SUPPORT(STD64)
     66  1.1  mrg C Entry: save registers, set un = un_param - vn + 1, bias rp and up, then pick diagonal, mul_1 or mul_2; output limbs rp[0] .. rp[un+1] are written
     67  1.1  mrg ASM_START()
     68  1.1  mrg 	TEXT
     69  1.1  mrg 	ALIGN(16)
     70  1.1  mrg PROLOGUE(mpn_mulmid_basecase)
     71  1.1  mrg 	FUNC_ENTRY(4)		C NOTE(review): macro not defined in this file; presumably remaps the 4 register arguments under DOS64 - confirm
     72  1.1  mrg IFDOS(`	mov	56(%rsp), %r8d	')	C DOS64 only: 32-bit load of vn from the stack (upper half of %r8 is zeroed); presumably the 5th argument - confirm
     73  1.1  mrg 	push	%rbx		C save the six callee-saved registers used below; popped in reverse order at the ret label
     74  1.1  mrg 	push	%rbp
     75  1.1  mrg 	push	%r12
     76  1.1  mrg 	push	%r13
     77  1.1  mrg 	push	%r14
     78  1.1  mrg 	push	%r15
     79  1.1  mrg 
     80  1.1  mrg 	mov	vp_param, vp	C free %rcx, which doubles as w1
     81  1.1  mrg 
     82  1.1  mrg 	C use un for row length (= un_param - vn + 1)
     83  1.1  mrg 	lea	1(un_param), un
     84  1.1  mrg 	sub	vn, un
     85  1.1  mrg 
     86  1.1  mrg 	lea	(rp,un,8), rp	C rp now points at output limb number un; all stores are relative to it
     87  1.1  mrg 
     88  1.1  mrg 	cmp	$4, un		C TODO: needs tuning
     89  1.1  mrg 	jc	L(diagonal)	C un < 4 (unsigned compare): use the diagonal code, with up left unbiased
     90  1.1  mrg 
     91  1.1  mrg 	lea	(up,un_param,8), up	C up += 8*un_param; the row code indexes it with negative offsets
     92  1.1  mrg 
     93  1.1  mrg 	test	$1, vn
     94  1.1  mrg 	jz	L(mul_2)	C even vn starts with mul_2; odd vn falls through to mul_1
     95  1.1  mrg 
     96  1.1  mrg C ===========================================================
     97  1.1  mrg C     mul_1 for vp[0] if vn is odd
     98  1.1  mrg C     Stores (does not add) the products of the row by v0 into rp, then hands over to the addmul_2 code
     99  1.1  mrg L(mul_1):
    100  1.1  mrg 	mov	R32(un), R32(w0)	C w0 = row length; reduced mod 4 below to select the loop entry
    101  1.1  mrg 
    102  1.1  mrg 	neg	un		C un = -(row length)
    103  1.1  mrg 	mov	(up,un,8), %rax	C first source limb of the row
    104  1.1  mrg 	mov	(vp), v0
    105  1.1  mrg 	mul	v0		C rdx:rax = first product, picked up by the prologue chosen below
    106  1.1  mrg 
    107  1.1  mrg 	and	$-4, un		C round down to multiple of 4
    108  1.1  mrg 	mov	un, n		C n = negative index, stepped by 4 toward zero
    109  1.1  mrg 
    110  1.1  mrg 	and	$3, R32(w0)
    111  1.1  mrg 	jz	L(mul_1_prologue_0)	C row length mod 4 == 0
    112  1.1  mrg 	cmp	$2, R32(w0)
    113  1.1  mrg 	jc	L(mul_1_prologue_1)	C row length mod 4 == 1
    114  1.1  mrg 	jz	L(mul_1_prologue_2)	C row length mod 4 == 2; otherwise fall through for 3
    115  1.1  mrg 
    116  1.1  mrg L(mul_1_prologue_3):
    117  1.1  mrg 	mov	%rax, w3
    118  1.1  mrg 	mov	%rdx, w0
    119  1.1  mrg 	lea	L(addmul_prologue_3)(%rip), outer_addr	C remember the matching addmul_2 prologue for all later passes
    120  1.1  mrg 	jmp	L(mul_1_entry_3)
    121  1.1  mrg 
    122  1.1  mrg 	ALIGN(16)
    123  1.1  mrg L(mul_1_prologue_0):
    124  1.1  mrg 	mov	%rax, w2
    125  1.1  mrg 	mov	%rdx, w3		C note already w0 == 0
    126  1.1  mrg 	lea	L(addmul_prologue_0)(%rip), outer_addr
    127  1.1  mrg 	jmp	L(mul_1_entry_0)
    128  1.1  mrg 
    129  1.1  mrg 	ALIGN(16)
    130  1.1  mrg L(mul_1_prologue_1):
    131  1.1  mrg 	add	$4, n
    132  1.1  mrg 	mov	%rax, w1
    133  1.1  mrg 	mov	%rdx, w2
    134  1.1  mrg 	mov	$0, R32(w3)
    135  1.1  mrg 	mov	(up,n,8), %rax	C preload the source limb that entry_1 multiplies
    136  1.1  mrg 	lea	L(addmul_prologue_1)(%rip), outer_addr
    137  1.1  mrg 	jmp	L(mul_1_entry_1)
    138  1.1  mrg 
    139  1.1  mrg 	ALIGN(16)
    140  1.1  mrg L(mul_1_prologue_2):
    141  1.1  mrg 	mov	%rax, w0
    142  1.1  mrg 	mov	%rdx, w1
    143  1.1  mrg 	mov	24(up,n,8), %rax	C preload the source limb that entry_2 multiplies
    144  1.1  mrg 	mov	$0, R32(w2)
    145  1.1  mrg 	mov	$0, R32(w3)
    146  1.1  mrg 	lea	L(addmul_prologue_2)(%rip), outer_addr
    147  1.1  mrg 	jmp	L(mul_1_entry_2)
    148  1.1  mrg 
    149  1.1  mrg 
    150  1.1  mrg 	C this loop is 10 c/loop = 2.5 c/l on K8
    151  1.1  mrg 
    152  1.1  mrg 	ALIGN(16)
    153  1.1  mrg L(mul_1_top):
    154  1.1  mrg 	mov	w0, -16(rp,n,8)
    155  1.1  mrg 	add	%rax, w1
    156  1.1  mrg 	mov	(up,n,8), %rax
    157  1.1  mrg 	adc	%rdx, w2
    158  1.1  mrg L(mul_1_entry_1):
    159  1.1  mrg 	mov	$0, R32(w0)
    160  1.1  mrg 	mul	v0
    161  1.1  mrg 	mov	w1, -8(rp,n,8)
    162  1.1  mrg 	add	%rax, w2
    163  1.1  mrg 	adc	%rdx, w3
    164  1.1  mrg L(mul_1_entry_0):
    165  1.1  mrg 	mov	8(up,n,8), %rax
    166  1.1  mrg 	mul	v0
    167  1.1  mrg 	mov	w2, (rp,n,8)
    168  1.1  mrg 	add	%rax, w3
    169  1.1  mrg 	adc	%rdx, w0
    170  1.1  mrg L(mul_1_entry_3):
    171  1.1  mrg 	mov	16(up,n,8), %rax
    172  1.1  mrg 	mul	v0
    173  1.1  mrg 	mov	w3, 8(rp,n,8)
    174  1.1  mrg 	mov	$0, R32(w2)		C zero
    175  1.1  mrg 	mov	w2, w3			C zero
    176  1.1  mrg 	add	%rax, w0
    177  1.1  mrg 	mov	24(up,n,8), %rax
    178  1.1  mrg 	mov	w2, w1			C zero
    179  1.1  mrg 	adc	%rdx, w1
    180  1.1  mrg L(mul_1_entry_2):
    181  1.1  mrg 	mul	v0
    182  1.1  mrg 	add	$4, n
    183  1.1  mrg 	js	L(mul_1_top)	C keep looping while n is still negative
    184  1.1  mrg 
    185  1.1  mrg 	mov	w0, -16(rp)
    186  1.1  mrg 	add	%rax, w1
    187  1.1  mrg 	mov	w1, -8(rp)
    188  1.1  mrg 	mov	w2, 8(rp)		C zero last limb of output
    189  1.1  mrg 	adc	%rdx, w2	C the stores above do not touch flags, so this still uses the carry from the add
    190  1.1  mrg 	mov	w2, (rp)
    191  1.1  mrg 
    192  1.1  mrg 	dec	vn
    193  1.1  mrg 	jz	L(ret)		C vn was 1: no further limbs of vp to process
    194  1.1  mrg 
    195  1.1  mrg 	lea	-8(up), up	C next pass: up moves back one limb
    196  1.1  mrg 	lea	8(vp), vp	C and vp moves forward past the limb just used
    197  1.1  mrg 
    198  1.1  mrg 	mov	un, n		C restart the index from the negated, rounded row length
    199  1.1  mrg 	mov	(vp), v0
    200  1.1  mrg 	mov	8(vp), v1
    201  1.1  mrg 
    202  1.1  mrg 	jmp	*outer_addr	C enter addmul_2 through the prologue selected above
    203  1.1  mrg 
    204  1.1  mrg C ===========================================================
    205  1.1  mrg C     mul_2 for vp[0], vp[1] if vn is even
    206  1.1  mrg C     Stores (does not add) the products of the row by the pair v0, v1 into rp, then hands over to the addmul_2 code
    207  1.1  mrg 	ALIGN(16)
    208  1.1  mrg L(mul_2):
    209  1.1  mrg 	mov	R32(un), R32(w0)	C w0 = row length; reduced mod 4 below to select the loop entry
    210  1.1  mrg 
    211  1.1  mrg 	neg	un		C un = -(row length)
    212  1.1  mrg 	mov	-8(up,un,8), %rax
    213  1.1  mrg 	mov	(vp), v0
    214  1.1  mrg 	mov	8(vp), v1
    215  1.1  mrg 	mul	v1		C rdx:rax = first product, picked up by the prologue chosen below
    216  1.1  mrg 
    217  1.1  mrg 	and	$-4, un		C round down to multiple of 4
    218  1.1  mrg 	mov	un, n		C n = negative index, stepped by 4 up to zero
    219  1.1  mrg 
    220  1.1  mrg 	and	$3, R32(w0)
    221  1.1  mrg 	jz	L(mul_2_prologue_0)	C row length mod 4 == 0
    222  1.1  mrg 	cmp	$2, R32(w0)
    223  1.1  mrg 	jc	L(mul_2_prologue_1)	C row length mod 4 == 1
    224  1.1  mrg 	jz	L(mul_2_prologue_2)	C row length mod 4 == 2; otherwise fall through for 3
    225  1.1  mrg 
    226  1.1  mrg L(mul_2_prologue_3):
    227  1.1  mrg 	mov	%rax, w1
    228  1.1  mrg 	mov	%rdx, w2
    229  1.1  mrg 	lea	L(addmul_prologue_3)(%rip), outer_addr	C remember the matching addmul_2 prologue for all later passes
    230  1.1  mrg 	jmp	L(mul_2_entry_3)
    231  1.1  mrg 
    232  1.1  mrg 	ALIGN(16)
    233  1.1  mrg L(mul_2_prologue_0):
    234  1.1  mrg 	mov	%rax, w0
    235  1.1  mrg 	mov	%rdx, w1
    236  1.1  mrg 	lea	L(addmul_prologue_0)(%rip), outer_addr
    237  1.1  mrg 	jmp	L(mul_2_entry_0)
    238  1.1  mrg 
    239  1.1  mrg 	ALIGN(16)
    240  1.1  mrg L(mul_2_prologue_1):
    241  1.1  mrg 	mov	%rax, w3
    242  1.1  mrg 	mov	%rdx, w0
    243  1.1  mrg 	mov	$0, R32(w1)
    244  1.1  mrg 	lea	L(addmul_prologue_1)(%rip), outer_addr
    245  1.1  mrg 	jmp	L(mul_2_entry_1)
    246  1.1  mrg 
    247  1.1  mrg 	ALIGN(16)
    248  1.1  mrg L(mul_2_prologue_2):
    249  1.1  mrg 	mov	%rax, w2
    250  1.1  mrg 	mov	%rdx, w3
    251  1.1  mrg 	mov	$0, R32(w0)
    252  1.1  mrg 	mov	16(up,n,8), %rax	C preload the source limb that entry_2 multiplies by v0
    253  1.1  mrg 	lea	L(addmul_prologue_2)(%rip), outer_addr
    254  1.1  mrg 	jmp	L(mul_2_entry_2)
    255  1.1  mrg 
    256  1.1  mrg 
    257  1.1  mrg 	C this loop is 18 c/loop = 2.25 c/l on K8
    258  1.1  mrg 
    259  1.1  mrg 	ALIGN(16)
    260  1.1  mrg L(mul_2_top):
    261  1.1  mrg 	mov     -8(up,n,8), %rax
    262  1.1  mrg 	mul     v1
    263  1.1  mrg 	add     %rax, w0
    264  1.1  mrg 	adc     %rdx, w1
    265  1.1  mrg L(mul_2_entry_0):
    266  1.1  mrg 	mov     $0, R32(w2)
    267  1.1  mrg 	mov     (up,n,8), %rax
    268  1.1  mrg 	mul     v0
    269  1.1  mrg 	add     %rax, w0
    270  1.1  mrg 	mov     (up,n,8), %rax
    271  1.1  mrg 	adc     %rdx, w1
    272  1.1  mrg 	adc     $0, R32(w2)
    273  1.1  mrg 	mul     v1
    274  1.1  mrg 	add     %rax, w1
    275  1.1  mrg 	mov     w0, (rp,n,8)
    276  1.1  mrg 	adc     %rdx, w2
    277  1.1  mrg L(mul_2_entry_3):
    278  1.1  mrg 	mov     8(up,n,8), %rax
    279  1.1  mrg 	mul     v0
    280  1.1  mrg 	mov     $0, R32(w3)
    281  1.1  mrg 	add     %rax, w1
    282  1.1  mrg 	adc     %rdx, w2
    283  1.1  mrg 	mov     $0, R32(w0)
    284  1.1  mrg 	adc     $0, R32(w3)
    285  1.1  mrg 	mov     8(up,n,8), %rax
    286  1.1  mrg 	mov     w1, 8(rp,n,8)
    287  1.1  mrg 	mul     v1
    288  1.1  mrg 	add     %rax, w2
    289  1.1  mrg 	mov     16(up,n,8), %rax
    290  1.1  mrg 	adc     %rdx, w3
    291  1.1  mrg L(mul_2_entry_2):
    292  1.1  mrg 	mov     $0, R32(w1)
    293  1.1  mrg 	mul     v0
    294  1.1  mrg 	add     %rax, w2
    295  1.1  mrg 	mov     16(up,n,8), %rax
    296  1.1  mrg 	adc     %rdx, w3
    297  1.1  mrg 	adc     $0, R32(w0)
    298  1.1  mrg 	mul     v1
    299  1.1  mrg 	add     %rax, w3
    300  1.1  mrg 	mov     w2, 16(rp,n,8)
    301  1.1  mrg 	adc     %rdx, w0
    302  1.1  mrg L(mul_2_entry_1):
    303  1.1  mrg 	mov     24(up,n,8), %rax
    304  1.1  mrg 	mul     v0
    305  1.1  mrg 	add     %rax, w3
    306  1.1  mrg 	adc     %rdx, w0
    307  1.1  mrg 	adc     $0, R32(w1)
    308  1.1  mrg 	add     $4, n
    309  1.1  mrg 	mov     w3, -8(rp,n,8)	C mov keeps the flags from the add of 4, which the jnz tests
    310  1.1  mrg 	jnz     L(mul_2_top)
    311  1.1  mrg 
    312  1.1  mrg 	mov	w0, (rp)	C the two top limbs of the first pass
    313  1.1  mrg 	mov	w1, 8(rp)
    314  1.1  mrg 
    315  1.1  mrg 	sub	$2, vn
    316  1.1  mrg 	jz	L(ret)		C vn was 2: no further limbs of vp to process
    317  1.1  mrg 
    318  1.1  mrg 	lea	16(vp), vp	C next pass: vp moves forward past the pair just used
    319  1.1  mrg 	lea	-16(up), up	C and up moves back two limbs
    320  1.1  mrg 
    321  1.1  mrg 	mov	un, n		C restart the index from the negated, rounded row length
    322  1.1  mrg 	mov	(vp), v0
    323  1.1  mrg 	mov	8(vp), v1
    324  1.1  mrg 
    325  1.1  mrg 	jmp	*outer_addr	C enter addmul_2 through the prologue selected above
    326  1.1  mrg 
    327  1.1  mrg C ===========================================================
    328  1.1  mrg C     addmul_2 for remaining vp's
    329  1.1  mrg C     Each pass multiplies the row by the pair v0, v1 and adds the result into rp; entered via jmp *outer_addr
    330  1.1  mrg 	ALIGN(16)
    331  1.1  mrg L(addmul_prologue_0):
    332  1.1  mrg 	mov	-8(up,n,8), %rax
    333  1.1  mrg 	mul	v1
    334  1.1  mrg 	mov	%rax, w1
    335  1.1  mrg 	mov	%rdx, w2
    336  1.1  mrg 	mov	$0, R32(w3)
    337  1.1  mrg 	jmp	L(addmul_entry_0)
    338  1.1  mrg 
    339  1.1  mrg 	ALIGN(16)
    340  1.1  mrg L(addmul_prologue_1):
    341  1.1  mrg 	mov	16(up,n,8), %rax
    342  1.1  mrg 	mul	v1
    343  1.1  mrg 	mov	%rax, w0
    344  1.1  mrg 	mov	%rdx, w1
    345  1.1  mrg 	mov	$0, R32(w2)
    346  1.1  mrg 	mov	24(up,n,8), %rax	C preload the source limb that entry_1 multiplies by v0
    347  1.1  mrg 	jmp	L(addmul_entry_1)
    348  1.1  mrg 
    349  1.1  mrg 	ALIGN(16)
    350  1.1  mrg L(addmul_prologue_2):
    351  1.1  mrg 	mov	8(up,n,8), %rax
    352  1.1  mrg 	mul	v1
    353  1.1  mrg 	mov	%rax, w3
    354  1.1  mrg 	mov	%rdx, w0
    355  1.1  mrg 	mov	$0, R32(w1)
    356  1.1  mrg 	jmp	L(addmul_entry_2)
    357  1.1  mrg 
    358  1.1  mrg 	ALIGN(16)
    359  1.1  mrg L(addmul_prologue_3):
    360  1.1  mrg 	mov	(up,n,8), %rax
    361  1.1  mrg 	mul	v1
    362  1.1  mrg 	mov	%rax, w2
    363  1.1  mrg 	mov	%rdx, w3
    364  1.1  mrg 	mov	$0, R32(w0)
    365  1.1  mrg 	mov	$0, R32(w1)
    366  1.1  mrg 	jmp	L(addmul_entry_3)
    367  1.1  mrg 
    368  1.1  mrg 	C this loop is 19 c/loop = 2.375 c/l on K8
    369  1.1  mrg 
    370  1.1  mrg 	ALIGN(16)
    371  1.1  mrg L(addmul_top):
    372  1.1  mrg 	mov	$0, R32(w3)
    373  1.1  mrg 	add	%rax, w0
    374  1.1  mrg 	mov	-8(up,n,8), %rax
    375  1.1  mrg 	adc	%rdx, w1
    376  1.1  mrg 	adc	$0, R32(w2)
    377  1.1  mrg 	mul	v1
    378  1.1  mrg 	add	w0, -8(rp,n,8)	C accumulate into the output limb in memory
    379  1.1  mrg 	adc	%rax, w1
    380  1.1  mrg 	adc	%rdx, w2
    381  1.1  mrg L(addmul_entry_0):
    382  1.1  mrg 	mov	(up,n,8), %rax
    383  1.1  mrg 	mul	v0
    384  1.1  mrg 	add	%rax, w1
    385  1.1  mrg 	mov	(up,n,8), %rax
    386  1.1  mrg 	adc	%rdx, w2
    387  1.1  mrg 	adc	$0, R32(w3)
    388  1.1  mrg 	mul	v1
    389  1.1  mrg 	add	w1, (rp,n,8)
    390  1.1  mrg 	mov	$0, R32(w1)	C mov rather than xor: the carry from the add above is still needed
    391  1.1  mrg 	adc	%rax, w2
    392  1.1  mrg 	mov	$0, R32(w0)
    393  1.1  mrg 	adc	%rdx, w3
    394  1.1  mrg L(addmul_entry_3):
    395  1.1  mrg 	mov	8(up,n,8), %rax
    396  1.1  mrg 	mul	v0
    397  1.1  mrg 	add	%rax, w2
    398  1.1  mrg 	mov	8(up,n,8), %rax
    399  1.1  mrg 	adc	%rdx, w3
    400  1.1  mrg 	adc	$0, R32(w0)
    401  1.1  mrg 	mul	v1
    402  1.1  mrg 	add	w2, 8(rp,n,8)
    403  1.1  mrg 	adc	%rax, w3
    404  1.1  mrg 	adc	%rdx, w0
    405  1.1  mrg L(addmul_entry_2):
    406  1.1  mrg 	mov	16(up,n,8), %rax
    407  1.1  mrg 	mul	v0
    408  1.1  mrg 	add	%rax, w3
    409  1.1  mrg 	mov	16(up,n,8), %rax
    410  1.1  mrg 	adc	%rdx, w0
    411  1.1  mrg 	adc	$0, R32(w1)
    412  1.1  mrg 	mul	v1
    413  1.1  mrg 	add	w3, 16(rp,n,8)
    414  1.1  mrg 	nop			C don't ask...
    415  1.1  mrg 	adc	%rax, w0
    416  1.1  mrg 	mov	$0, R32(w2)
    417  1.1  mrg 	mov	24(up,n,8), %rax
    418  1.1  mrg 	adc	%rdx, w1
    419  1.1  mrg L(addmul_entry_1):
    420  1.1  mrg 	mul	v0
    421  1.1  mrg 	add	$4, n
    422  1.1  mrg 	jnz	L(addmul_top)
    423  1.1  mrg 
    424  1.1  mrg 	add	%rax, w0	C fold the last product into the accumulators
    425  1.1  mrg 	adc	%rdx, w1
    426  1.1  mrg 	adc	$0, R32(w2)
    427  1.1  mrg 
    428  1.1  mrg 	add	w0, -8(rp)	C add the three top limbs into the output, carrying upward
    429  1.1  mrg 	adc	w1, (rp)
    430  1.1  mrg 	adc	w2, 8(rp)
    431  1.1  mrg 
    432  1.1  mrg 	sub	$2, vn
    433  1.1  mrg 	jz	L(ret)		C every limb of vp has been used
    434  1.1  mrg 
    435  1.1  mrg 	lea	16(vp), vp	C next pass: vp moves forward past the pair just used
    436  1.1  mrg 	lea	-16(up), up	C and up moves back two limbs
    437  1.1  mrg 
    438  1.1  mrg 	mov	un, n		C restart the index from the negated, rounded row length
    439  1.1  mrg 	mov	(vp), v0
    440  1.1  mrg 	mov	8(vp), v1
    441  1.1  mrg 
    442  1.1  mrg 	jmp	*outer_addr	C same prologue again for the next pair
    443  1.1  mrg 
    444  1.1  mrg C ===========================================================
    445  1.1  mrg C     accumulate along diagonals if un - vn is small
    446  1.1  mrg C     One output limb per outer iteration; products are summed in the three-limb accumulator w2:w1:w0
    447  1.1  mrg 	ALIGN(16)
    448  1.1  mrg L(diagonal):
    449  1.1  mrg 	xor	R32(w0), R32(w0)
    450  1.1  mrg 	xor	R32(w1), R32(w1)
    451  1.1  mrg 	xor	R32(w2), R32(w2)
    452  1.1  mrg 
    453  1.1  mrg 	neg	un		C un counts up to zero, one output limb per step
    454  1.1  mrg 
    455  1.1  mrg 	mov	R32(vn), %eax
    456  1.1  mrg 	and	$3, %eax	C vn mod 4 selects the entry point into the unrolled inner loop
    457  1.1  mrg 	jz	L(diag_prologue_0)
    458  1.1  mrg 	cmp	$2, %eax
    459  1.1  mrg 	jc	L(diag_prologue_1)
    460  1.1  mrg 	jz	L(diag_prologue_2)
    461  1.1  mrg 
    462  1.1  mrg L(diag_prologue_3):
    463  1.1  mrg 	lea	-8(vp), vp	C bias vp so that 8(vp_inner) at entry_3 is the first limb of vp
    464  1.1  mrg 	mov	vp, vp_inner
    465  1.1  mrg 	add	$1, vn		C round the inner count up to a multiple of 4
    466  1.1  mrg 	mov	vn, n
    467  1.1  mrg 	lea	L(diag_entry_3)(%rip), outer_addr	C entry_3 loads its own operand, so later iterations re-enter it directly
    468  1.1  mrg 	jmp	L(diag_entry_3)
    469  1.1  mrg 
    470  1.1  mrg L(diag_prologue_0):
    471  1.1  mrg 	mov	vp, vp_inner
    472  1.1  mrg 	mov	vn, n
    473  1.1  mrg 	lea	0(%rip), outer_addr	C address of the next instruction: later iterations resume at the preload below
    474  1.1  mrg 	mov     -8(up,n,8), %rax
    475  1.1  mrg 	jmp	L(diag_entry_0)
    476  1.1  mrg 
    477  1.1  mrg L(diag_prologue_1):
    478  1.1  mrg 	lea	8(vp), vp	C bias vp so that -8(vp_inner) below is the first limb of vp
    479  1.1  mrg 	mov	vp, vp_inner
    480  1.1  mrg 	add	$3, vn		C round the inner count up to a multiple of 4
    481  1.1  mrg 	mov	vn, n
    482  1.1  mrg 	lea	0(%rip), outer_addr	C address of the next instruction: later iterations resume at the preload below
    483  1.1  mrg 	mov     -8(vp_inner), %rax
    484  1.1  mrg 	jmp	L(diag_entry_1)
    485  1.1  mrg 
    486  1.1  mrg L(diag_prologue_2):
    487  1.1  mrg 	lea	-16(vp), vp	C bias vp so that 16(vp_inner) below is the first limb of vp
    488  1.1  mrg 	mov	vp, vp_inner
    489  1.1  mrg 	add	$2, vn		C round the inner count up to a multiple of 4
    490  1.1  mrg 	mov	vn, n
    491  1.1  mrg 	lea	0(%rip), outer_addr	C address of the next instruction: later iterations resume at the preload below
    492  1.1  mrg 	mov	16(vp_inner), %rax
    493  1.1  mrg 	jmp	L(diag_entry_2)
    494  1.1  mrg 
    495  1.1  mrg 
    496  1.1  mrg 	C this loop is 10 c/loop = 2.5 c/l on K8
    497  1.1  mrg 
    498  1.1  mrg 	ALIGN(16)
    499  1.1  mrg L(diag_top):
    500  1.1  mrg 	add     %rax, w0
    501  1.1  mrg 	adc     %rdx, w1
    502  1.1  mrg 	mov     -8(up,n,8), %rax
    503  1.1  mrg 	adc     $0, w2
    504  1.1  mrg L(diag_entry_0):
    505  1.1  mrg 	mulq    (vp_inner)
    506  1.1  mrg 	add     %rax, w0
    507  1.1  mrg 	adc     %rdx, w1
    508  1.1  mrg 	adc     $0, w2
    509  1.1  mrg L(diag_entry_3):
    510  1.1  mrg 	mov     -16(up,n,8), %rax
    511  1.1  mrg 	mulq    8(vp_inner)
    512  1.1  mrg 	add     %rax, w0
    513  1.1  mrg 	mov     16(vp_inner), %rax
    514  1.1  mrg 	adc     %rdx, w1
    515  1.1  mrg 	adc     $0, w2
    516  1.1  mrg L(diag_entry_2):
    517  1.1  mrg 	mulq    -24(up,n,8)
    518  1.1  mrg 	add     %rax, w0
    519  1.1  mrg 	mov     24(vp_inner), %rax
    520  1.1  mrg 	adc     %rdx, w1
    521  1.1  mrg 	lea     32(vp_inner), vp_inner	C lea leaves the carry untouched for the adc that follows
    522  1.1  mrg 	adc     $0, w2
    523  1.1  mrg L(diag_entry_1):
    524  1.1  mrg 	mulq    -32(up,n,8)
    525  1.1  mrg 	sub     $4, n
    526  1.1  mrg 	jnz	L(diag_top)
    527  1.1  mrg 
    528  1.1  mrg 	add	%rax, w0	C fold in the last product of this diagonal
    529  1.1  mrg 	adc	%rdx, w1
    530  1.1  mrg 	adc	$0, w2
    531  1.1  mrg 
    532  1.1  mrg 	mov	w0, (rp,un,8)	C store the finished output limb
    533  1.1  mrg 
    534  1.1  mrg 	inc	un
    535  1.1  mrg 	jz	L(diag_end)	C last diagonal done
    536  1.1  mrg 
    537  1.1  mrg 	mov	vn, n		C reset the inner count and the multiplier pointer
    538  1.1  mrg 	mov	vp, vp_inner
    539  1.1  mrg 
    540  1.1  mrg 	lea	8(up), up	C the next diagonal starts one source limb later
    541  1.1  mrg 	mov	w1, w0		C shift the accumulator down by one limb
    542  1.1  mrg 	mov	w2, w1
    543  1.1  mrg 	xor	R32(w2), R32(w2)
    544  1.1  mrg 
    545  1.1  mrg 	jmp	*outer_addr	C back to the entry (or its preload) chosen by the prologue
    546  1.1  mrg 
    547  1.1  mrg L(diag_end):
    548  1.1  mrg 	mov	w1, (rp)	C the remaining accumulator limbs are the two top output limbs
    549  1.1  mrg 	mov	w2, 8(rp)
    550  1.1  mrg 
    551  1.1  mrg L(ret):	pop	%r15		C common exit: restore the registers pushed on entry, in reverse order
    552  1.1  mrg 	pop	%r14
    553  1.1  mrg 	pop	%r13
    554  1.1  mrg 	pop	%r12
    555  1.1  mrg 	pop	%rbp
    556  1.1  mrg 	pop	%rbx
    557  1.1  mrg 	FUNC_EXIT()		C NOTE(review): macro not defined in this file; presumably the DOS64 counterpart of FUNC_ENTRY - confirm
    558  1.1  mrg 	ret
    559  1.1  mrg EPILOGUE()
    560