sublsh1_n.asm revision 1.1.1.1.2.1 1 dnl AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
2
3 dnl Copyright 2003, 2005, 2006, 2007, 2011, 2012 Free Software Foundation,
4 dnl Inc.
5
6 dnl This file is part of the GNU MP Library.
7
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
12
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
17
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23
24 C cycles/limb
25 C AMD K8,K9 2.2
26 C AMD K10 2.2
27 C Intel P4 12.75
28 C Intel core2 3.45
29 C Intel corei ?
30 C Intel atom ?
31 C VIA nano 3.25
32
33 C Sometimes speed degenerates, supposedly because some operand alignments
34 C cause cache conflicts.
35
36 C The speed is limited by decoding/issue bandwidth. There are 26 instructions
37 C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
38
39 C INPUT PARAMETERS
C Symbolic names for the four argument registers.  The operation, per the
C file header, is rp[] = up[] - (vp[] << 1).
40 define(`rp',`%rdi')     C destination limb pointer
41 define(`up',`%rsi')     C minuend limb pointer
42 define(`vp',`%rdx')     C pointer to the limbs that are doubled and subtracted
43 define(`n', `%rcx')     C limb count; negated and used as a running index
44
C NOTE(review): ABI_SUPPORT is defined outside this file; presumably it
C declares which calling conventions this entry point can be built for.
45 ABI_SUPPORT(DOS64)
46 ABI_SUPPORT(STD64)
47
48 ASM_START()
49 TEXT
50 ALIGN(16)               C presumably aligns the function entry to 16 bytes
C mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1) over n limbs.
C
C In:    rp, up, vp = limb pointers, n = limb count (register aliases are
C        set up by the define() lines earlier in the file).
C Out:   %eax = scy + acy, where scy is the bit shifted out of the top limb
C        of vp and acy is the final borrow of the subtraction (so 0..2).
C        The value is produced by 32-bit writes, so the upper half of %rax
C        is zero.
C Uses:  %r8-%r11 and %rbx/%rbp as limb scratch; %rbx and %rbp are saved and
C        restored here.  rp, up, vp and n are modified.
C
C Two carry chains are interleaved: the shift chain (add/adc reg,reg doubles
C each vp limb) and the subtract chain (sub/sbb).  Only one can live in CF
C at a time, so the idle one is parked as 0 or -1 in a register with
C "sbb r,r" and brought back into CF with "add r,r":
C   %eax holds -scy, %ebp holds -acy.
C
C The main loop handles 4 limbs per pass.  The n mod 4 leftover limbs are
C handled first by b01/b10/b11; when n mod 4 == 0 the loop is entered in
C the middle at L(b00).
C
C NOTE(review): the code reads (vp) unconditionally and the n mod 4 == 0
C path always processes four limbs, so n >= 1 appears to be assumed.
51 PROLOGUE(mpn_sublsh1_n)
52 FUNC_ENTRY(4)           C NOTE(review): defined elsewhere; presumably moves the 4 DOS64 args into the STD64 registers
53 push %rbx               C callee-saved, used as limb scratch below
54 push %rbp               C callee-saved, used as limb scratch and to hold -acy
55
56 mov (vp), %r8           C r8 = vp[0], fetched before vp is advanced
57 mov R32(n), R32(%rax)   C eax = low 32 bits of n, reduced mod 4 below
58 lea (rp,n,8), rp        C point rp, up and vp just past their last limb
59 lea (up,n,8), up
60 lea (vp,n,8), vp
61 neg n                   C n = -n, so (ptr,n,8) addresses limb 0 and n counts up to 0
62 xor R32(%rbp), R32(%rbp) C acy = 0, needed when entering at L(b00)
63 and $3, R32(%rax)       C eax = n mod 4; and also leaves CF = 0
64 je L(b00)               C n mod 4 == 0: enter the loop with r8 = vp[0], CF = 0
65 cmp $2, R32(%rax)
66 jc L(b01)               C eax < 2, i.e. n mod 4 == 1
67 je L(b10)               C n mod 4 == 2; fall through when it is 3
68
C Three leftover limbs.
69 L(b11): add %r8, %r8    C start of shift chain: r8 = 2*vp[0]
70 mov 8(vp,n,8), %r9
71 adc %r9, %r9
72 mov 16(vp,n,8), %r10
73 adc %r10, %r10
74 sbb R32(%rax), R32(%rax) C save scy: eax = -CF (0 or -1)
75 mov (up,n,8), %rbp
76 mov 8(up,n,8), %rbx
77 sub %r8, %rbp           C start of subtract chain, no incoming borrow
78 sbb %r9, %rbx
79 mov %rbp, (rp,n,8)
80 mov %rbx, 8(rp,n,8)
81 mov 16(up,n,8), %rbp    C mov does not touch CF, the borrow chain continues
82 sbb %r10, %rbp
83 mov %rbp, 16(rp,n,8)
84 sbb R32(%rbp), R32(%rbp) C save acy: ebp = -CF (0 or -1)
85 add $3, n               C 3 limbs done; the sign flag from this add is tested at L(ent)
86 jmp L(ent)              C jmp leaves the flags intact
87
C Two leftover limbs.
88 L(b10): add %r8, %r8    C start of shift chain: r8 = 2*vp[0]
89 mov 8(vp,n,8), %r9
90 adc %r9, %r9
91 sbb R32(%rax), R32(%rax) C save scy: eax = -CF (0 or -1)
92 mov (up,n,8), %rbp
93 mov 8(up,n,8), %rbx
94 sub %r8, %rbp           C start of subtract chain, no incoming borrow
95 sbb %r9, %rbx
96 mov %rbp, (rp,n,8)
97 mov %rbx, 8(rp,n,8)
98 sbb R32(%rbp), R32(%rbp) C save acy: ebp = -CF (0 or -1)
99 add $2, n               C 2 limbs done; the sign flag from this add is tested at L(ent)
100 jmp L(ent)             C jmp leaves the flags intact
101
C One leftover limb.
102 L(b01): add %r8, %r8   C r8 = 2*vp[0]
103 sbb R32(%rax), R32(%rax) C save scy: eax = -CF (0 or -1)
104 mov (up,n,8), %rbp
105 sub %r8, %rbp
106 mov %rbp, (rp,n,8)
107 sbb R32(%rbp), R32(%rbp) C save acy: ebp = -CF (0 or -1)
108 inc n                  C 1 limb done; sets the sign flag for the jns below
109 L(ent): jns L(end)     C n >= 0: no limbs remain, skip the loop
110
C Main loop, 4 limbs per pass.  On entry at L(top): eax = -scy, ebp = -acy.
111 ALIGN(16)
112 L(top): add R32(%rax), R32(%rax) C restore scy: CF = 1 exactly when eax was -1
113
114 mov (vp,n,8), %r8
115 L(b00): adc %r8, %r8   C mid-loop entry for n mod 4 == 0 (r8 preloaded, CF = 0)
116 mov 8(vp,n,8), %r9
117 adc %r9, %r9
118 mov 16(vp,n,8), %r10
119 adc %r10, %r10
120 mov 24(vp,n,8), %r11
121 adc %r11, %r11
122
123 sbb R32(%rax), R32(%rax) C save scy: eax = -CF (0 or -1)
124 add R32(%rbp), R32(%rbp) C restore acy: CF = 1 exactly when ebp was -1
125
126 mov (up,n,8), %rbp     C rbp is free again now that acy is back in CF
127 mov 8(up,n,8), %rbx
128 sbb %r8, %rbp
129 sbb %r9, %rbx
130 mov %rbp, (rp,n,8)
131 mov %rbx, 8(rp,n,8)
132 mov 16(up,n,8), %rbp
133 mov 24(up,n,8), %rbx
134 sbb %r10, %rbp
135 sbb %r11, %rbx
136 mov %rbp, 16(rp,n,8)
137 mov %rbx, 24(rp,n,8)
138
139 sbb R32(%rbp), R32(%rbp) C save acy: ebp = -CF (0 or -1)
140 add $4, n              C 4 limbs done
141 js L(top)              C loop while n is still negative
142
143 L(end): add R32(%rbp), R32(%rax) C eax = (-scy) + (-acy)
144 neg R32(%rax)          C eax = scy + acy
145
146 pop %rbp               C restore in reverse order of the pushes
147 pop %rbx
148 FUNC_EXIT()            C NOTE(review): defined elsewhere; presumably undoes FUNC_ENTRY for DOS64
149 ret
150 EPILOGUE()
151