/* Copyright (C) 2000-2013 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson (at) cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
#ifdef L__divxf3
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
//
// Scheme: frcpa gives an initial reciprocal approximation in f10 and
// sets p6 for the normal case; when p6 is clear, frcpa has already
// produced the final result itself (special operands), and the (p7)
// move at the end returns it unchanged.

        .text
        .align 16
        .global __divxf3
        .proc __divxf3
__divxf3:
#ifdef SHARED
        .global __divtf3
__divtf3:
#endif
        // p7 <- true; re-cleared below iff p6 (the normal path) is taken,
        // so exactly one of p6/p7 selects the final result.
        cmp.eq p7, p0 = r0, r0
        // f10 <- approx 1/farg1; p6 set unless frcpa resolved the divide itself.
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // Reciprocal refinement: f11 = 1 - b*y0 (the error term e),
        // f12 = a*y0 (initial quotient estimate).
(p6)    fnma.s1 f11 = farg1, f10, f1
(p6)    fma.s1 f12 = farg0, f10, f0
        ;;
(p6)    fma.s1 f13 = f11, f11, f0
(p6)    fma.s1 f14 = f11, f11, f11
        ;;
(p6)    fma.s1 f11 = f13, f13, f11
(p6)    fma.s1 f13 = f14, f10, f10
        ;;
(p6)    fma.s1 f10 = f13, f11, f10
        // Quotient correction: f11 = a - b*q (remainder of the estimate).
(p6)    fnma.s1 f11 = farg1, f12, farg0
        ;;
(p6)    fma.s1 f11 = f11, f10, f12
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fnma.s1 f12 = farg1, f11, farg0
        ;;
        // Final fused step in .s0 delivers the IEEE-rounded quotient.
(p6)    fma.s0 fret0 = f12, f10, f11
        // Special-operand path: frcpa's own output is the answer.
(p7)    mov fret0 = f10
        br.ret.sptk rp
        .endp __divxf3
#endif
72 1.1 mrg
#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// Same frcpa + Newton-Raphson structure as __divxf3, but intermediate
// steps use fewer iterations since only double precision is required;
// the .d completers round the late steps to double.

        .text
        .align 16
        .global __divdf3
        .proc __divdf3
__divdf3:
        // p7 defaults to true; cleared on the normal (p6) path so the two
        // final writes to fret0 are mutually exclusive.
        cmp.eq p7, p0 = r0, r0
        // f10 <- approx 1/farg1; p6 clear means frcpa resolved a special case.
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // f11 = a*y0 (quotient estimate), f12 = 1 - b*y0 (error term e).
(p6)    fmpy.s1 f11 = farg0, f10
(p6)    fnma.s1 f12 = farg1, f10, f1
        ;;
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fmpy.s1 f13 = f12, f12
        ;;
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fma.s1 f11 = f13, f11, f11
        ;;
(p6)    fmpy.s1 f12 = f13, f13
(p6)    fma.s1 f10 = f13, f10, f10
        ;;
        // Round quotient estimate to double; keep reciprocal in full precision.
(p6)    fma.d.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
        ;;
        // f8 = a - b*q: exact remainder used for the final correction.
(p6)    fnma.d.s1 f8 = farg1, f11, farg0
        ;;
        // Correctly rounded double result (user rounding mode, .s0 status).
(p6)    fma.d fret0 = f8, f10, f11
(p7)    mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdf3
#endif
114 1.1 mrg
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// Shortest of the three FP dividers: single precision needs only two
// quotient refinements before the final fnorm rounds to float.

        .text
        .align 16
        .global __divsf3
        .proc __divsf3
__divsf3:
        // p7 defaults to true; cleared below on the normal (p6) path.
        cmp.eq p7, p0 = r0, r0
        // f10 <- approx 1/farg1; p6 clear means frcpa handled a special case.
        frcpa.s0 f10, p6 = farg0, farg1
        ;;
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
        // f8 = a*y0 (quotient estimate), f9 = 1 - b*y0 (error term e).
(p6)    fmpy.s1 f8 = farg0, f10
(p6)    fnma.s1 f9 = farg1, f10, f1
        ;;
        // q1 = q0 + e*q0; e2 = e^2.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
        // q2 = q1 + e2*q1; e4 = e2^2.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
        ;;
        // q3 rounded to double; precise enough to round correctly to float.
(p6)    fma.d.s1 f10 = f9, f8, f8
        ;;
        // Final rounding to single in the user status field.
(p6)    fnorm.s.s0 fret0 = f10
        // Special-operand path: frcpa's own output is the answer.
(p7)    mov fret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsf3
#endif
150 1.1 mrg
#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Strategy: move both operands to FP, divide via frcpa + 3 Newton-Raphson
// iterations (extended precision is wide enough to make the 64-bit
// truncated quotient exact), then convert back with truncation.

        .text
        .align 16
        .global __divdi3
        .proc __divdi3
__divdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        // break 1: raise the divide-by-zero trap when in1 == 0.
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
        // f11 = 1 - b*y0 (error), f12 = a*y0 (quotient estimate).
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
        // f12 = a - b*q: remainder used for the final correction step.
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divdi3
#endif
203 1.1 mrg
#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// Same division scheme as __divdi3, but the original dividend is kept
// live in f14 so the remainder can be reconstructed at the end as
// r = q*(-b) + a with a single fixed-point xma.

        .text
        .align 16
        .global __moddi3
        .proc __moddi3
__moddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        // f14 keeps the raw (fixed-point) dividend for the final xma.
        setf.sig f14 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        fcvt.xf f8 = f14
        fcvt.xf f9 = f9
        // break 1: raise the divide-by-zero trap when in1 == 0.
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        // Negate the divisor in parallel with the FP work; -b feeds the xma.
        sub in1 = r0, in1
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Truncate the quotient toward zero.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __moddi3
#endif
260 1.1 mrg
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Mirrors __divdi3 with unsigned conversions (fcvt.xuf / fcvt.fxu).

        .text
        .align 16
        .global __udivdi3
        .proc __udivdi3
__udivdi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, to avoid FP software-assist faults.
        fcvt.xuf.s1 f8 = f8
        fcvt.xuf.s1 f9 = f9
        // break 1: raise the divide-by-zero trap when in1 == 0.
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
        ;;
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
(p6)    fma.s1 f10 = f13, f10, f10
        // f12 = a - b*q: remainder for the final correction step.
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __udivdi3
#endif
313 1.1 mrg
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// Unsigned sibling of __moddi3: divide via frcpa + 3 Newton-Raphson
// iterations, truncate the quotient, then reconstruct r = q*(-b) + a.

        .text
        .align 16
        .global __umoddi3
        .proc __umoddi3
__umoddi3:
        .regstk 2,0,0,0
        // Transfer inputs to FP registers.
        // f14 keeps the raw (fixed-point) dividend for the final xma.
        setf.sig f14 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Convert the inputs to FP, to avoid FP software assist faults.
        fcvt.xuf.s1 f8 = f14
        fcvt.xuf.s1 f9 = f9
        // break 1: raise the divide-by-zero trap when in1 == 0.
        // (Stray trailing ';' removed for consistency with the other routines.)
(p7)    break 1
        ;;
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
        ;;
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
        ;;
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
        ;;
        // Negate the divisor in parallel with the FP work; -b feeds the xma.
        sub in1 = r0, in1
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f10 = f12, f10, f11
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f14
        ;;
        // Transfer result to GP registers.
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __umoddi3
#endif
371 1.1 mrg
#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// 32-bit operands need only one short refinement after frcpa; a small
// fudge term (f11) is added to the error so truncation is still exact.

        .text
        .align 16
        .global __divsi3
        .proc __divsi3
__divsi3:
        .regstk 2,0,0,0
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        // Sign-extend both 32-bit operands to 64 bits.
        sxt4 in0 = in0
        sxt4 in1 = in1
        ;;
        setf.sig f8 = in0
        setf.sig f9 = in1
        // break 1: raise the divide-by-zero trap when in1 == 0.
(p7)    break 1
        ;;
        // 0x0ffdd is a register-format biased exponent; via setf.exp it
        // builds the small constant used as a rounding fudge (bias is
        // 0xffff, so this is 2^-34 -- TODO confirm against the Itanium SDM).
        mov r2 = 0x0ffdd
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // f8 = a*y0 (quotient estimate), f9 = 1 - b*y0 (error term).
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
        // One refinement; fold the fudge constant into the squared error.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        // Truncate the quotient toward zero.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __divsi3
#endif
417 1.1 mrg
#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Combines the __divsi3 division scheme with the __moddi3 remainder
// reconstruction r = q*(-b) + a.

        .text
        .align 16
        .global __modsi3
        .proc __modsi3
__modsi3:
        .regstk 2,0,0,0
        // 0x0ffdd: biased exponent for the rounding fudge constant
        // (see __divsi3).
        mov r2 = 0x0ffdd
        sxt4 in0 = in0
        sxt4 in1 = in1
        ;;
        // r32/r33 are the raw stacked-register names for in0/in1.
        // f13 keeps the fixed-point dividend for the final xma.
        setf.sig f13 = r32
        setf.sig f9 = r33
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Negate the divisor early; -b feeds the final xma.
        sub in1 = r0, in1
        fcvt.xf f8 = f13
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        // break 1: raise the divide-by-zero trap when in1 == 0.
(p7)    break 1
        ;;
        // f12 = a*y0 (quotient estimate), f10 = 1 - b*y0 (error term).
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        // Truncate the quotient toward zero.
        fcvt.fx.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f13
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __modsi3
#endif
467 1.1 mrg
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Unsigned sibling of __divsi3: zxt4 instead of sxt4, and an unsigned
// truncating conversion at the end.

        .text
        .align 16
        .global __udivsi3
        .proc __udivsi3
__udivsi3:
        .regstk 2,0,0,0
        // 0x0ffdd: biased exponent for the rounding fudge constant
        // (see __divsi3).
        mov r2 = 0x0ffdd
        // Zero-extend both 32-bit operands to 64 bits.
        zxt4 in0 = in0
        zxt4 in1 = in1
        ;;
        setf.sig f8 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        fcvt.xf f8 = f8
        fcvt.xf f9 = f9
        // break 1: raise the divide-by-zero trap when in1 == 0.
(p7)    break 1
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        ;;
        // f8 = a*y0 (quotient estimate), f9 = 1 - b*y0 (error term).
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
        ;;
        // One refinement; fold the fudge constant into the squared error.
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
        ;;
(p6)    fma.s1 f10 = f9, f8, f8
        ;;
        // Truncate the quotient as unsigned.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __udivsi3
#endif
513 1.1 mrg
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// Unsigned sibling of __modsi3: divide, truncate the quotient as
// unsigned, then reconstruct r = q*(-b) + a.

        .text
        .align 16
        .global __umodsi3
        .proc __umodsi3
__umodsi3:
        .regstk 2,0,0,0
        // 0x0ffdd: biased exponent for the rounding fudge constant
        // (see __divsi3).
        mov r2 = 0x0ffdd
        // Zero-extend both 32-bit operands to 64 bits.
        zxt4 in0 = in0
        zxt4 in1 = in1
        ;;
        // f13 keeps the fixed-point dividend for the final xma.
        setf.sig f13 = in0
        setf.sig f9 = in1
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        ;;
        // Negate the divisor early; -b feeds the final xma.
        sub in1 = r0, in1
        fcvt.xf f8 = f13
        fcvt.xf f9 = f9
        ;;
        setf.exp f11 = r2
        frcpa.s1 f10, p6 = f8, f9
        // break 1: raise the divide-by-zero trap when in1 == 0.
        // (Stray trailing ';' removed for consistency with the other routines.)
(p7)    break 1
        ;;
        // f12 = a*y0 (quotient estimate), f10 = 1 - b*y0 (error term).
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
        ;;
        setf.sig f9 = in1
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
        ;;
(p6)    fma.s1 f10 = f10, f12, f12
        ;;
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        ;;
        // r = q * (-b) + a
        xma.l f10 = f10, f9, f13
        ;;
        getf.sig ret0 = f10
        br.ret.sptk rp
        ;;
        .endp __umodsi3
#endif
563 1.1 mrg
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
//
// Layout written to save_area (offsets in bytes):
//   [in0+0]  stack pointer (in1)
//   [in0+8]  ar.bsp        (restored later via ar.bspstore)
//   [in0+16] ar.rnat
//   [in0+24] ar.pfs
// The store sequence below reaches these slots via post-increment
// addressing (+24 then -16), not in offset order.

        .text
        .align 16
        .global __ia64_save_stack_nonlocal
        .proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
        { .mmf
          alloc r18 = ar.pfs, 2, 0, 0, 0
          mov r19 = ar.rsc               // remember current RSE control
          ;;
        }
        { .mmi
          flushrs                        // spill dirty regs: ar.bsp==ar.bspstore after
          st8 [in0] = in1, 24            // save stack pointer; in0 -> +24 (pfs slot)
          and r19 = 0x1c, r19            // clear rsc.mode bits -> enforced lazy
          ;;
        }
        { .mmi
          st8 [in0] = r18, -16           // save ar.pfs; in0 -> +8 (bsp slot)
          mov ar.rsc = r19               // RSE off so ar.rnat reads a stable value
          or r19 = 0x3, r19              // precompute value that re-enables eager mode
          ;;
        }
        { .mmi
          mov r16 = ar.bsp
          mov r17 = ar.rnat
          adds r2 = 8, in0               // r2 -> +16 (rnat slot)
          ;;
        }
        { .mmi
          st8 [in0] = r16                // save ar.bsp
          st8 [r2] = r17                 // save ar.rnat
        }
        { .mib
          mov ar.rsc = r19               // restore RSE mode
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_save_stack_nonlocal
#endif
612 1.1 mrg
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);
//
// Restores the machine state saved by __ia64_save_stack_nonlocal
// (sp, ar.bspstore, ar.rnat, ar.pfs from save_area), installs the
// static chain in r15, and "returns" to target_label by loading it
// into rp.  The .L0 tag on the final bundle is the return-pointer
// hint target for the mov.ret.

        .text
        .align 16
        .global __ia64_nonlocal_goto
        .proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
        { .mmi
          alloc r20 = ar.pfs, 3, 0, 0, 0
          ld8 r12 = [in1], 8             // restore sp; in1 -> bsp slot
          mov.ret.sptk rp = in0, .L0     // rp = target label
          ;;
        }
        { .mmf
          ld8 r16 = [in1], 8             // saved ar.bsp; in1 -> rnat slot
          mov r19 = ar.rsc               // remember current RSE control
          ;;
        }
        { .mmi
          flushrs                        // must be done before writing ar.bspstore
          ld8 r17 = [in1], 8             // saved ar.rnat; in1 -> pfs slot
          and r19 = 0x1c, r19            // clear rsc.mode bits -> enforced lazy
          ;;
        }
        { .mmi
          ld8 r18 = [in1]                // saved ar.pfs
          mov ar.rsc = r19               // RSE off so bspstore/rnat can be written
          or r19 = 0x3, r19              // precompute value that re-enables eager mode
          ;;
        }
        { .mmi
          mov ar.bspstore = r16          // restore backing-store pointer
          ;;
          mov ar.rnat = r17              // must follow the bspstore write
          ;;
        }
        { .mmi
          loadrs                         // invalidate regs below the restored bsp
          invala                         // invalidate ALAT entries
          mov r15 = in2                  // static chain register
          ;;
        }
.L0:    { .mib
          mov ar.rsc = r19               // restore RSE mode
          mov ar.pfs = r18
          br.ret.sptk.few rp             // branch to the target label
          ;;
        }
        .endp __ia64_nonlocal_goto
#endif
665 1.1 mrg
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)
//
// Restores sp, ar.bspstore, ar.rnat and ar.pfs from the area written
// by __ia64_save_stack_nonlocal, then performs a normal return
// (unlike nonlocal_goto, rp is left untouched and no static chain is
// installed).

        .text
        .align 16
        .global __ia64_restore_stack_nonlocal
        .proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
        { .mmf
          alloc r20 = ar.pfs, 4, 0, 0, 0
          ld8 r12 = [in0], 8             // restore sp; in0 -> bsp slot
          ;;
        }
        { .mmb
          ld8 r16=[in0], 8               // saved ar.bsp; in0 -> rnat slot
          mov r19 = ar.rsc               // remember current RSE control
          ;;
        }
        { .mmi
          flushrs                        // must be done before writing ar.bspstore
          ld8 r17 = [in0], 8             // saved ar.rnat; in0 -> pfs slot
          and r19 = 0x1c, r19            // clear rsc.mode bits -> enforced lazy
          ;;
        }
        { .mmf
          ld8 r18 = [in0]                // saved ar.pfs
          mov ar.rsc = r19               // RSE off so bspstore/rnat can be written
          ;;
        }
        { .mmi
          mov ar.bspstore = r16          // restore backing-store pointer
          ;;
          mov ar.rnat = r17              // must follow the bspstore write
          or r19 = 0x3, r19              // precompute value that re-enables eager mode
          ;;
        }
        { .mmf
          loadrs                         // invalidate regs below the restored bsp
          invala                         // invalidate ALAT entries
          ;;
        }
.L0:    { .mib
          mov ar.rsc = r19               // restore RSE mode
          mov ar.pfs = r18
          br.ret.sptk.few rp
          ;;
        }
        .endp __ia64_restore_stack_nonlocal
#endif
718 1.1 mrg
#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+ > fake function descriptor
//		| TRAMP+16	    | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link	    |
//		+-------------------+
//
// On entry r1 (gp) holds TRAMP+16, the second word of the fake
// descriptor, because the caller loaded gp from that descriptor.
// From there we pick up the static link (-> r15), then the real
// target's descriptor: entry point (-> b6) and its gp (-> r1).

        .text
        .align 16
        .global __ia64_trampoline
        .proc __ia64_trampoline
__ia64_trampoline:
        { .mmi
          ld8 r2 = [r1], 8               // r2 = &target descriptor
          ;;
          ld8 r15 = [r1]                 // r15 = static link
        }
        { .mmi
          ld8 r3 = [r2], 8               // r3 = target entry point
          ;;
          ld8 r1 = [r2]                  // r1 = target's gp
          mov b6 = r3
        }
        { .bbb
          br.sptk.many b6                // tail-jump into the nested function
          ;;
        }
        .endp __ia64_trampoline
#endif
758 1.1 mrg
#ifdef SHARED
// Thunks for backward compatibility.
// Historically the "tf" names referred to the 80-bit extended type;
// these entry points simply tail-branch to the current "xf" symbols.
#ifdef L_fixtfdi
        .text
        .align 16
        .global __fixtfti
        .proc __fixtfti
__fixtfti:
        { .bbb
          br.sptk.many __fixxfti         // tail call; args/return pass through
          ;;
        }
        .endp __fixtfti
#endif
#ifdef L_fixunstfdi
        .align 16
        .global __fixunstfti
        .proc __fixunstfti
__fixunstfti:
        { .bbb
          br.sptk.many __fixunsxfti      // tail call; args/return pass through
          ;;
        }
        .endp __fixunstfti
#endif
#ifdef L_floatditf
        .align 16
        .global __floattitf
        .proc __floattitf
__floattitf:
        { .bbb
          br.sptk.many __floattixf       // tail call; args/return pass through
          ;;
        }
        .endp __floattitf
#endif
#endif
796