rshift.asm revision 1.1.1.1.2.1 1 dnl PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt
2
3 dnl Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
4
5 dnl This file is part of the GNU MP Library.
6
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
11
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
16
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22 C cycles/limb
23 C POWER3/PPC630 ?
24 C POWER4/PPC970 ?
25 C POWER5 2.25
26 C POWER6 9.75
27 C POWER7 2.15
28
29 C TODO
30 C * Try to reduce the number of needed live registers
31 C * Micro-optimise header code
32 C * Keep in synch with lshift.asm and lshiftc.asm
33
34 C INPUT PARAMETERS
C The four arguments arrive in r3..r6.  In the code below rp is only stored
C through, up is only loaded from, n is the limb count and cnt is the shift
C amount handed to srd.
35 define(`rp', `r3')
36 define(`up', `r4')
37 define(`n', `r5')
38 define(`cnt', `r6')
39
C Working registers.
C   tnc     64 - cnt, the complementary shift amount handed to sld
C   u0, u1  hold limbs loaded from up; they live in r30/r31, which the
C           function saves on entry and reloads before returning
C   retval  result register; it shares r5 with n, so n must not be read
C           once retval has been written
40 define(`tnc',`r0')
41 define(`u0',`r30')
42 define(`u1',`r31')
43 define(`retval',`r5')
44
C mpn_rshift(rp, up, n, cnt) -- per the file header, rp[] = up[] >> cnt.
C
C Outline of the code below:
C  * tnc = 64 - cnt.  Every output limb except the topmost one is formed as
C    (up[i] >> cnt) | (up[i+1] << tnc); the topmost one is the last input
C    limb >> cnt with nothing or-ed in.
C  * retval = up[0] << tnc is computed up front and is moved to r3 at L(ret)
C    (split across r3 and r4 when HAVE_ABI_mode32 is defined).
C  * The loop at L(top) loads, combines and stores four limbs per iteration.
C    ctr is loaded with (n + 3) >> 2 (upper bits masked off under
C    HAVE_ABI_mode32), and n & 3 selects one of four feed-in sequences:
C      n & 3 = 0   L(b00), joins the loop at L(L00)
C      n & 3 = 1   L(b01), joins the loop at L(top)
C      n & 3 = 2   L(b10), joins the loop at L(L10)
C      n & 3 = 3   L(b11), joins the loop at L(L11)
C    Each feed-in goes straight to the wind-down code instead when ctr
C    reaches zero before the loop is entered.
C  * Loads run ahead of the stores, and the shifts and ors for different
C    limbs are interleaved, so the instruction order is significant.
C  * Nothing here checks n or cnt.  NOTE(review): callers presumably
C    guarantee n >= 1 and 1 <= cnt <= 63 -- confirm against the mpn interface.
45 ASM_START()
46 PROLOGUE(mpn_rshift)
C Save r31 and r30 (u1 and u0) below r1 without moving r1; they are reloaded
C at L(ret).
47 std r31, -8(r1)
48 std r30, -16(r1)
C tnc = 64 - cnt
49 subfic tnc, cnt, 64
50 C sldi r30, n, 3 C byte count corresponding to n
51 C add rp, rp, r30 C rp = rp + n
52 C add up, up, r30 C up = up + n
C Classify n & 3: cr0 records whether it is zero, cr6 how it compares with 2.
53 rldicl. r30, n, 0,62 C r30 = n & 3, set cr0
54 cmpdi cr6, r30, 2
55 addi r31, n, 3 C compute count...
56 ld r10, 0(up) C load 1st limb for b00...b11
C retval = up[0] << tnc.  This overwrites n (both are r5); n is not read
C again after the addi above.
57 sld retval, r10, tnc
58 ifdef(`HAVE_ABI_mode32',
59 ` rldicl r31, r31, 62,34', C ...branch count
60 ` srdi r31, r31, 2') C ...for ctr
61 mtctr r31 C copy count into ctr
C Four-way dispatch on n & 3; the case n & 3 = 3 falls through to L(b11).
62 beq cr0, L(b00)
63 blt cr6, L(b01)
64 ld r11, 8(up) C load 2nd limb for b10 and b11
65 beq cr6, L(b10)
66
67 ALIGN(16)
C n & 3 = 3.  rp is biased by -16 so that the first store on this path, at
C 16(rp) in L(L11) or L(cj3), writes the first output limb.
68 L(b11): srd r8, r10, cnt
69 sld r9, r11, tnc
70 ld u1, 16(up)
71 addi up, up, 24
72 srd r12, r11, cnt
73 sld r7, u1, tnc
74 addi rp, rp, -16
75 bdnz L(gt3)
76
C ctr reached zero: only three limbs, finish them in the wind-down code.
77 or r11, r8, r9
78 srd r8, u1, cnt
79 b L(cj3)
80
81 ALIGN(16)
C More than three limbs: preload two more limbs and enter the loop at L(L11).
82 L(gt3): ld u0, 0(up)
83 or r11, r8, r9
84 srd r8, u1, cnt
85 sld r9, u0, tnc
86 ld u1, 8(up)
87 or r10, r12, r7
88 b L(L11)
89
90 ALIGN(32)
C n & 3 = 2.  rp is biased by -24 so that the first store on this path, at
C 24(rp) in L(L10) or L(cj2), writes the first output limb.
91 L(b10): srd r12, r10, cnt
92 addi rp, rp, -24
93 sld r7, r11, tnc
94 bdnz L(gt2)
95
C ctr reached zero: only two limbs, finish them in the wind-down code.
96 srd r8, r11, cnt
97 or r10, r12, r7
98 b L(cj2)
99
C More than two limbs: preload three more limbs and enter the loop at L(L10).
100 L(gt2): ld u0, 16(up)
101 srd r8, r11, cnt
102 sld r9, u0, tnc
103 ld u1, 24(up)
104 or r10, r12, r7
105 srd r12, u0, cnt
106 sld r7, u1, tnc
107 ld u0, 32(up)
108 or r11, r8, r9
109 addi up, up, 16
110 b L(L10)
111
112 ALIGN(16)
C n & 3 = 0.  Limbs 1..3 are loaded here without any test on n.  rp is biased
C by -8 so that the first store on this path, at 8(rp) in L(L00) or L(cj4),
C writes the first output limb.
113 L(b00): ld u1, 8(up)
114 srd r12, r10, cnt
115 sld r7, u1, tnc
116 ld u0, 16(up)
117 srd r8, u1, cnt
118 sld r9, u0, tnc
119 ld u1, 24(up)
120 or r10, r12, r7
121 srd r12, u0, cnt
122 sld r7, u1, tnc
123 addi rp, rp, -8
124 bdz L(cj4)
125
C ctr is still nonzero: advance up past the four limbs read so far, preload
C the next limb and enter the loop at L(L00).
126 L(gt4): addi up, up, 32
127 ld u0, 0(up)
128 or r11, r8, r9
129 b L(L00)
130
131 ALIGN(16)
C n & 3 = 1.  rp needs no bias here: the first store is at 0(rp).
C If ctr reaches zero right away there is a single limb, which is shifted,
C stored, and the function returns.
132 L(b01): bdnz L(gt1)
133 srd r8, r10, cnt
134 std r8, 0(rp)
135 b L(ret)
136
C More than one limb: preload limbs 1..4, then either go to L(end) if ctr
C is exhausted or fall into the loop at L(top).
137 L(gt1): ld u0, 8(up)
138 srd r8, r10, cnt
139 sld r9, u0, tnc
140 ld u1, 16(up)
141 srd r12, u0, cnt
142 sld r7, u1, tnc
143 ld u0, 24(up)
144 or r11, r8, r9
145 srd r8, u1, cnt
146 sld r9, u0, tnc
147 ld u1, 32(up)
148 addi up, up, 40
149 or r10, r12, r7
150 bdz L(end)
151
152 ALIGN(32)
C Main loop, four limbs per pass.  u0 and u1 alternate as the most recently
C loaded limb; r8/r9 and r12/r7 hold the two shifted halves that are or-ed
C into r11 and r10, which alternate as the value stored next.  Both up and rp
C advance by 32 bytes per pass.  L(L00), L(L11) and L(L10) are the mid-loop
C entry points used by the feed-in code above.
153 L(top): srd r12, u0, cnt
154 sld r7, u1, tnc
155 ld u0, 0(up)
156 std r11, 0(rp)
157 or r11, r8, r9
158 L(L00): srd r8, u1, cnt
159 sld r9, u0, tnc
160 ld u1, 8(up)
161 std r10, 8(rp)
162 or r10, r12, r7
163 L(L11): srd r12, u0, cnt
164 sld r7, u1, tnc
165 ld u0, 16(up)
166 std r11, 16(rp)
167 or r11, r8, r9
168 L(L10): srd r8, u1, cnt
169 sld r9, u0, tnc
170 ld u1, 24(up)
171 addi up, up, 32
172 std r10, 24(rp)
173 addi rp, rp, 32
174 or r10, r12, r7
175 bdnz L(top)
176
177 ALIGN(32)
C Wind-down: no more loads, just combine and store the limbs still held in
C registers.  L(cj4), L(cj3) and L(cj2) are also reached directly from
C L(b00), L(b11) and L(b10) when ctr expires before the loop is entered.
C The final store, at 32(rp), writes r8, which is a plain srd of the last
C limb loaded and therefore has zeros in its top cnt bits.
178 L(end): srd r12, u0, cnt
179 sld r7, u1, tnc
180 std r11, 0(rp)
181 L(cj4): or r11, r8, r9
182 srd r8, u1, cnt
183 std r10, 8(rp)
184 L(cj3): or r10, r12, r7
185 std r11, 16(rp)
186 L(cj2): std r10, 24(rp)
187 std r8, 32(rp)
188
C Common exit: reload r31 and r30 from the slots written on entry, then move
C retval into r3, or into r3 (upper 32 bits) and r4 when HAVE_ABI_mode32 is
C defined.
189 L(ret): ld r31, -8(r1)
190 ld r30, -16(r1)
191 ifdef(`HAVE_ABI_mode32',
192 ` srdi r3, retval, 32
193 mr r4, retval
194 ',` mr r3, retval')
195 blr
196 EPILOGUE()
197