/* Copyright (C) 2000-2024 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#ifdef L__divxf3
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
	.text
	.align 16
	.global __divxf3
	.proc __divxf3
__divxf3:
#ifdef SHARED
	.global __divtf3
__divtf3:
#endif
	// p7 <- true: if frcpa delivers the final result directly (p6
	// clear below), fret0 is just copied from f10 at the end.
	cmp.eq p7, p0 = r0, r0
	// f10 <- initial approximation of 1/farg1; p6 is set when the
	// Newton-Raphson refinement below must run, and cleared when f10
	// already holds the IEEE result (special operands).
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	// If refining, cancel the p7 shortcut path.
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// e = 1 - b*y (residual of the reciprocal approximation);
	// q = a*y (first quotient estimate).
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
	// Refine y with the polynomial in e (minimum-latency schedule
	// from the Intel IA-64 Optimization Guide).
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
(p6)	fma.s1 f10 = f13, f11, f10
	// r = a - b*q (quotient residual).
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
	// q' = q + r*y; then recompute the reciprocal residual.
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
	// Final correctly rounded quotient (s0 status field);
	// otherwise frcpa's direct result.
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	.endp __divxf3
#endif

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	// p7 <- true: used when frcpa delivers the final result directly
	// (p6 clear), in which case fret0 is copied from f10 at the end.
	cmp.eq p7, p0 = r0, r0
	// f10 <- initial approximation of 1/farg1; p6 set when the
	// refinement below must run.
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	// If refining, cancel the p7 shortcut path.
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// q = a*y; e = 1 - b*y (residual of the approximation).
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
	// Newton-Raphson refinement of quotient (f11) and reciprocal
	// (f10) using powers of the residual e (f12, f13 = e^2, e^4).
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
	// .d completers: round the quotient estimate to double.
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
	// r = a - b*q (final residual, double precision).
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
	// Final correctly rounded double quotient; otherwise frcpa's
	// direct result.
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif

#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	// p7 <- true: used when frcpa delivers the final result directly
	// (p6 clear), in which case fret0 is copied from f10 at the end.
	cmp.eq p7, p0 = r0, r0
	// f10 <- initial approximation of 1/farg1; p6 set when the
	// refinement below must run.
	frcpa.s0 f10, p6 = farg0, farg1
	;;
	// If refining, cancel the p7 shortcut path.
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// q = a*y; e = 1 - b*y (residual of the approximation).
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
	// Two refinement steps: q += e*q, then square e and repeat.
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
	// Normalize/round to single with the user status field (s0).
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	// Divisor was zero: raise a break fault (divide-by-zero trap).
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
	// e = 1 - b*y; q = a*y.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
	// r = a - b*q; final correction q += r*y.
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  The dividend is kept in f14
	// for the final xma fixup below.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
	// Divisor was zero: raise a break fault (divide-by-zero trap).
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations (same scheme as __divdi3).
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// Negate b in parallel with the FP work; -b feeds the xma below.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Truncate quotient toward zero.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif

#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	// (xuf: unsigned fixed -> floating.)
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
	// Divisor was zero: raise a break fault (divide-by-zero trap).
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations (same scheme as __divdi3).
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.  The dividend is kept in f14
	// for the final xma fixup below.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
	// Divisor was zero: raise a break fault (divide-by-zero trap).
(p7)	break 1;
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations (same scheme as __moddi3).
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	// Negate b in parallel with the FP work; -b feeds the xma below.
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	// Sign-extend the 32-bit inputs to 64 bits.
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Divisor was zero: raise a break fault (divide-by-zero trap).
(p7)	break 1
	;;
	// 0x0ffdd is a biased FP exponent; setf.exp below builds a tiny
	// constant from it.  NOTE(review): presumably a compensation term
	// so the truncation in fcvt.fx.trunc yields the exact quotient —
	// see Intel's division algorithm notes.
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Shorter refinement than the 64-bit case: 32-bit operands need
	// less precision.  q = a*y; e = 1 - b*y.
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	// Truncate quotient toward zero.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	// 0x0ffdd: biased exponent for the compensation constant (see
	// __divsi3).
	mov r2 = 0x0ffdd
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	// r32/r33 are the stacked-register names of in0/in1.  The
	// dividend is kept in f13 for the final xma fixup below.
	setf.sig f13 = r32
	setf.sig f9 = r33
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Negate b; -b feeds the xma below.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	// Divisor was zero: raise a break fault (divide-by-zero trap).
(p7)	break 1
	;;
	// Short refinement (see __divsi3): q = a*y; e = 1 - b*y.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	// Truncate quotient toward zero.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif

#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	// 0x0ffdd: biased exponent for the compensation constant (see
	// __divsi3).
	mov r2 = 0x0ffdd
	// Zero-extend the 32-bit inputs to 64 bits.
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Values are already non-negative after zxt4, so the signed
	// fixed->float conversion is safe here.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	// Divisor was zero: raise a break fault (divide-by-zero trap).
(p7)	break 1
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
	// Short refinement (see __divsi3): q = a*y; e = 1 - b*y.
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	// Truncate quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	// 0x0ffdd: biased exponent for the compensation constant (see
	// __divsi3).
	mov r2 = 0x0ffdd
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	// The dividend is kept in f13 for the final xma fixup below.
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Negate b; -b feeds the xma below.
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	// Divisor was zero: raise a break fault (divide-by-zero trap).
(p7)	break 1;
	;;
	// Short refinement (see __divsi3): q = a*y; e = 1 - b*y.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	// Truncate quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".
// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  // r18 <- ar.pfs (saved into the area below); r19 <- ar.rsc.
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  // Flush the register stack so ar.bsp == ar.bspstore, then
	  // store the stack pointer at save_area[0] (post-inc by 24).
	  flushrs
	  st8 [in0] = in1, 24
	  // Clear the RSC mode bits (put the RSE in enforced-lazy mode)
	  // so ar.rnat can be read safely.
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  // save_area[3] <- ar.pfs; switch to the stripped RSC, and
	  // prepare r19 with mode bits restored (0x3) for the exit path.
	  st8 [in0] = r18, -16
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  // Capture backing-store pointer and NaT collection.
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0
	  ;;
	}
	{ .mmi
	  // save_area[1] <- ar.bsp; save_area[2] <- ar.rnat.
	  st8 [in0] = r16
	  st8 [r2] = r17
	}
	{ .mib
	  // Restore the original RSC mode and return.
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  // Load saved sp; point rp at the target label so the final
	  // br.ret "returns" into it (.L0 marks the return point tag).
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8
	  mov.ret.sptk rp = in0, .L0
	  ;;
	}
	{ .mmf
	  // r16 <- saved ar.bsp; r19 <- current ar.rsc.
	  ld8 r16 = [in1], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  // Flush dirty registers, load saved ar.rnat, strip RSC mode
	  // bits (enforced-lazy) so ar.bspstore/ar.rnat may be written.
	  flushrs
	  ld8 r17 = [in1], 8
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  // r18 <- saved ar.pfs; enter lazy mode; prepare restored mode.
	  ld8 r18 = [in1]
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  // Restore backing store pointer, then NaT collection.
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  // Reload the register stack from the restored backing store,
	  // invalidate the ALAT, and pass the static chain in r15.
	  loadrs
	  invala
	  mov r15 = in2
	  ;;
	}
.L0:	{ .mib
	  // Restore RSC/PFS and branch to the target label via rp.
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif

#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  // Reload saved sp from save_area[0].
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8
	  ;;
	}
	{ .mmb
	  // r16 <- saved ar.bsp; r19 <- current ar.rsc.
	  ld8 r16=[in0], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  // Flush dirty registers, load saved ar.rnat, strip RSC mode
	  // bits so ar.bspstore/ar.rnat may be written.
	  flushrs
	  ld8 r17 = [in0], 8
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmf
	  // r18 <- saved ar.pfs; enter enforced-lazy mode.
	  ld8 r18 = [in0]
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  // Restore backing store pointer, then NaT collection, and
	  // prepare r19 with the original mode bits (0x3) restored.
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmf
	  // Reload the register stack and invalidate the ALAT.
	  loadrs
	  invala
	  ;;
	}
.L0:	{ .mib
	  // Restore RSC/PFS and return normally.
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+ > fake function descriptor
//		| TRAMP+16          | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link       |
//		+-------------------+

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	// On entry r1 points into the trampoline (the fake descriptor's
	// "gp" slot): load the target descriptor pointer, then the
	// static link into r15.
	{ .mmi
	  ld8 r2 = [r1], 8
	  ;;
	  ld8 r15 = [r1]
	}
	// Dereference the real function descriptor: entry point -> b6,
	// target gp -> r1.
	{ .mmi
	  ld8 r3 = [r2], 8
	  ;;
	  ld8 r1 = [r2]
	  mov b6 = r3
	}
	// Tail-jump to the nested function.
	{ .bbb
	  br.sptk.many b6
	  ;;
	}
	.endp __ia64_trampoline
#endif

#ifdef SHARED
// Thunks for backward compatibility.  Each old "tf" (128-bit) entry
// point tail-branches to the corresponding "xf" implementation.
#ifdef L_fixtfdi
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
__fixtfti:
	{ .bbb
	  br.sptk.many __fixxfti
	  ;;
	}
	.endp __fixtfti
#endif
#ifdef L_fixunstfdi
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
__fixunstfti:
	{ .bbb
	  br.sptk.many __fixunsxfti
	  ;;
	}
	.endp __fixunstfti
#endif
#ifdef L_floatditf
	.align 16
	.global __floattitf
	.proc __floattitf
__floattitf:
	{ .bbb
	  br.sptk.many __floattixf
	  ;;
	}
	.endp __floattitf
#endif
#endif