dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
dnl  result to a second limb vector.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2000-2005, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    3.0
C Itanium 2:  2.0

C TODO
C  * Further optimize feed-in and wind-down code, both for speed and code size.
C  * Handle low limb input and results specially, using a common stf8 in the
C    epilogue.
C  * Use 1 c/l carry propagation scheme in wind-down code.
C  * Use extra pointer registers for `up' and rp to speed up feed-in loads.
C  * Work out final differences with mul_1.asm.  That function is 300 bytes
C    smaller than this due to better loop scheduling and thus simpler feed-in
C    code.

C OVERVIEW
C  Computes {rp,n} += {up,n} * vl, one 64-bit limb at a time, and leaves the
C  high limb of the last product plus the final carry in r8 at br.ret.
C  NOTE(review): n is presumably >= 1, since up[0] and rp[0] are loaded
C  unconditionally before n is examined -- confirm against callers.
C
C  Structure:
C   1. Dispatch on n mod 4 to one of four feed-in sequences (.Lb01, .Lb10,
C      .Lb11, .Lb00).  Each one uses br.cloop on ar.lc = (n-1)/4 to decide
C      how much of the pipeline has to be primed.
C   2. Short operands jump straight into the wind-down chain (.Lcj8 ... .Lcj2).
C      Longer ones enter the 4-way unrolled loop at .Loop, .LL00, .LL11 or
C      .LL10, or go to .Le0 when no full loop iteration remains.
C   3. The wind-down chain drains the products still in flight and folds the
C      last carry into r8.
C
C REGISTER ROLES
C  r2        saved ar.lc, restored before every br.ret
C  r20       store pointer; copy of rp taken before the loads advance rp
C  r14       n mod 4 during dispatch, afterwards a sum limb to be stored
C  r15, r31  n-1 and (n-1)/4 during setup (r31 is reused below)
C  r16       sum limb to be stored (alternates with r14)
C  f6        vl
C  f7, f8    up[0] and rp[0]
C  f32-f35   limbs loaded from up
C  f44-f47   limbs loaded from rp
C  f36-f39   low product halves  (xma.l),  moved to r24-r27
C  f40-f43   high product halves (xma.hu), moved to r28-r31
C  p6/p7     carry / no-carry into the sums written to r14
C  p8/p9     carry / no-carry into the sums written to r16
C  r8        result: last high half, plus 1 if the final add carried

C INPUT PARAMETERS
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')

ASM_START()
PROLOGUE(mpn_addmul_1)
	.prologue
	.save	ar.lc, r2
	.body

ifdef(`HAVE_ABI_32',
`	addp4		rp = 0, rp		C M I
	addp4		up = 0, up		C M I
	zxt4		n = n			C I
	;;
')
{.mmi
	adds		r15 = -1, n		C M I
	mov		r20 = rp		C M I
	mov.i		r2 = ar.lc		C I0
}
{.mmi
	ldf8		f7 = [up], 8		C M
	ldf8		f8 = [rp], 8		C M
	and		r14 = 3, n		C M I
	;;
}
{.mmi
	setf.sig	f6 = vl			C M2 M3
	cmp.eq		p10, p0 = 0, r14	C M I
	shr.u		r31 = r15, 2		C I0
}
{.mmi
	cmp.eq		p11, p0 = 2, r14	C M I
	cmp.eq		p12, p0 = 3, r14	C M I
	nop.i		0			C I
	;;
}
C Comparing r0 with itself for inequality clears p6 and p8 and sets p7 and
C p9, i.e. the carry chain starts with no carry pending.
{.mii
	cmp.ne		p6, p7 = r0, r0		C M I
	mov.i		ar.lc = r31		C I0
	cmp.ne		p8, p9 = r0, r0		C M I
}
{.bbb
  (p10)	br.dptk		.Lb00			C B
  (p11)	br.dptk		.Lb10			C B
  (p12)	br.dptk		.Lb11			C B
	;;
}

C Feed-in for n mod 4 = 1.  Falls through when n = 1.
.Lb01:	br.cloop.dptk	.grt1			C B

	xma.l		f39 = f7, f6, f8	C F
	xma.hu		f43 = f7, f6, f8	C F
	;;
	getf.sig	r8 = f43		C M2
	stf8		[r20] = f39		C M2 M3
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B

C n > 1.  Falls through to the wind-down at .Lcj5 when n = 5.
.grt1:
	ldf8		f32 = [up], 8
	ldf8		f44 = [rp], 8
	;;
	ldf8		f33 = [up], 8
	ldf8		f45 = [rp], 8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f7, f6, f8
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f7, f6, f8
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt5

	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	stf8		[r20] = f39, 8
	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	getf.sig	r24 = f36
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	getf.sig	r25 = f37
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	getf.sig	r26 = f38
	br		.Lcj5

C n > 5.  r30 = 0 lets the first low half (r27) pass unchanged through the
C r30 + r27 add at the top of .Loop / .Le0.
.grt5:
	mov		r30 = 0
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36
	;;
	getf.sig	r28 = f40
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37
	br.cloop.dptk	.Loop
	br		.Le0


C Feed-in for n mod 4 = 2.  Falls through when n = 2.
.Lb10:	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt2

	xma.l		f38 = f7, f6, f8
	xma.hu		f42 = f7, f6, f8
	;;
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r30 = f42
	stf8		[r20] = f38, 8
	getf.sig	r27 = f39
	getf.sig	r8 = f43
	br		.Lcj2

C n > 2.  Falls through to the wind-down at .Lcj6 when n = 6.
.grt2:
	ldf8		f32 = [up], 8
	ldf8		f44 = [rp], 8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f7, f6, f8
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f7, f6, f8
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt6

	stf8		[r20] = f38, 8
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	getf.sig	r24 = f36
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	getf.sig	r25 = f37
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	br		.Lcj6

C n > 6.  r29 = 0 pairs with the first low half (r26) in the add at .LL10.
.grt6:
	mov		r29 = 0
	xma.l		f36 = f32, f6, f44
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36
	br		.LL10


C Feed-in for n mod 4 = 3.  Falls through when n = 3.
.Lb11:	ldf8		f34 = [up], 8
	ldf8		f46 = [rp], 8
	;;
	ldf8		f35 = [up], 8
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt3
	;;

	xma.l		f37 = f7, f6, f8
	xma.hu		f41 = f7, f6, f8
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	xma.l		f39 = f35, f6, f47
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	stf8		[r20] = f37, 8
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	getf.sig	r8 = f43
	br		.Lcj3

C n > 3.  Falls through to the wind-down at .Lcj7 when n = 7.
.grt3:
	ldf8		f32 = [up], 8
	xma.l		f37 = f7, f6, f8
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f7, f6, f8
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37		C FIXME
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt7

	getf.sig	r29 = f41
	stf8		[r20] = f37, 8		C FIXME
	xma.l		f36 = f32, f6, f44
	getf.sig	r26 = f38
	xma.hu		f40 = f32, f6, f44
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	getf.sig	r27 = f39
	xma.hu		f41 = f33, f6, f45
	;;
	getf.sig	r31 = f43
	xma.l		f38 = f34, f6, f46
	getf.sig	r24 = f36
	xma.hu		f42 = f34, f6, f46
	br		.Lcj7

C n > 7.  r28 = 0 pairs with the first low half (r25) in the add at .LL11.
.grt7:
	getf.sig	r29 = f41
	xma.l		f36 = f32, f6, f44
	mov		r28 = 0
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	;;
	getf.sig	r30 = f42
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	getf.sig	r27 = f39
	br		.LL11


C Feed-in for n mod 4 = 0.  Falls through when n = 4.
.Lb00:	ldf8		f33 = [up], 8
	ldf8		f45 = [rp], 8
	;;
	ldf8		f34 = [up], 8
	ldf8		f46 = [rp], 8
	;;
	ldf8		f35 = [up], 8
	xma.l		f36 = f7, f6, f8
	ldf8		f47 = [rp], 8
	xma.hu		f40 = f7, f6, f8
	br.cloop.dptk	.grt4

	xma.l		f37 = f33, f6, f45
	xma.hu		f41 = f33, f6, f45
	xma.l		f38 = f34, f6, f46
	xma.hu		f42 = f34, f6, f46
	;;
	getf.sig	r28 = f40
	stf8		[r20] = f36, 8
	xma.l		f39 = f35, f6, f47
	getf.sig	r25 = f37
	xma.hu		f43 = f35, f6, f47
	;;
	getf.sig	r29 = f41
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	getf.sig	r27 = f39
	br		.Lcj4

C n > 4.  Falls through to the wind-down at .Lcj8 when n = 8.
.grt4:
	ldf8		f32 = [up], 8
	xma.l		f37 = f33, f6, f45
	ldf8		f44 = [rp], 8
	xma.hu		f41 = f33, f6, f45
	;;
	ldf8		f33 = [up], 8
	xma.l		f38 = f34, f6, f46
	ldf8		f45 = [rp], 8
	xma.hu		f42 = f34, f6, f46
	;;
	ldf8		f34 = [up], 8
	getf.sig	r24 = f36		C FIXME
	xma.l		f39 = f35, f6, f47
	ldf8		f46 = [rp], 8
	getf.sig	r28 = f40
	xma.hu		f43 = f35, f6, f47
	;;
	ldf8		f35 = [up], 8
	getf.sig	r25 = f37
	ldf8		f47 = [rp], 8
	br.cloop.dptk	.grt8

	getf.sig	r29 = f41
	stf8		[r20] = f36, 8		C FIXME
	xma.l		f36 = f32, f6, f44
	getf.sig	r26 = f38
	getf.sig	r30 = f42
	xma.hu		f40 = f32, f6, f44
	;;
	xma.l		f37 = f33, f6, f45
	getf.sig	r27 = f39
	xma.hu		f41 = f33, f6, f45
	br		.Lcj8

C n > 8.  r31 = 0 pairs with the first low half (r24) in the add at .LL00.
.grt8:
	getf.sig	r29 = f41
	xma.l		f36 = f32, f6, f44
	mov		r31 = 0
	xma.hu		f40 = f32, f6, f44
	;;
	ldf8		f32 = [up], 8
	getf.sig	r26 = f38
	br		.LL00


C Each quarter of the loop issues one xma.l/xma.hu pair, one load from rp and
C one from up, and one carry step.  A carry step adds the high half of one
C product to the low half of the next (plus 1 under p6 or p8), stores the sum
C through r20, and derives the next carry predicates with an unsigned compare
C of the sum against one addend (leu if 1 was added, ltu otherwise).

C *** MAIN LOOP START ***
	ALIGN(32)				C insn	fed	cycle #
.Loop:
	.pred.rel "mutex", p6, p7		C num	by	i1 i2
	getf.sig	r29 = f41		C 00	16	0   0
	xma.l		f36 = f32, f6, f44	C 01	06,15	0   0
  (p6)	add		r14 = r30, r27, 1	C 02		0   0
	ldf8		f47 = [rp], 8		C 03		0   0
	xma.hu		f40 = f32, f6, f44	C 04	06,15	0   0
  (p7)	add		r14 = r30, r27		C 05		0   0
	;;
	.pred.rel "mutex", p6, p7
	ldf8		f32 = [up], 8		C 06		1   1
  (p6)	cmp.leu		p8, p9 = r14, r27	C 07		1   1
  (p7)	cmp.ltu		p8, p9 = r14, r27	C 08		1   1
	getf.sig	r26 = f38		C 09	25	2   1
	st8		[r20] = r14, 8		C 10		2   1
	nop.b		0			C 11		2   1
	;;
.LL00:
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42		C 12	28	3   2
	xma.l		f37 = f33, f6, f45	C 13	18,27	3   2
  (p8)	add		r16 = r31, r24, 1	C 14		3   2
	ldf8		f44 = [rp], 8		C 15		3   2
	xma.hu		f41 = f33, f6, f45	C 16	18,27	3   2
  (p9)	add		r16 = r31, r24		C 17		3   2
	;;
	.pred.rel "mutex", p8, p9
	ldf8		f33 = [up], 8		C 18		4   3
  (p8)	cmp.leu		p6, p7 = r16, r24	C 19		4   3
  (p9)	cmp.ltu		p6, p7 = r16, r24	C 20		4   3
	getf.sig	r27 = f39		C 21	37	5   3
	st8		[r20] = r16, 8		C 22		5   3
	nop.b		0			C 23		5   3
	;;
.LL11:
	.pred.rel "mutex", p6, p7
	getf.sig	r31 = f43		C 24	40	6   4
	xma.l		f38 = f34, f6, f46	C 25	30,39	6   4
  (p6)	add		r14 = r28, r25, 1	C 26		6   4
	ldf8		f45 = [rp], 8		C 27		6   4
	xma.hu		f42 = f34, f6, f46	C 28	30,39	6   4
  (p7)	add		r14 = r28, r25		C 29		6   4
	;;
	.pred.rel "mutex", p6, p7
	ldf8		f34 = [up], 8		C 30		7   5
  (p6)	cmp.leu		p8, p9 = r14, r25	C 31		7   5
  (p7)	cmp.ltu		p8, p9 = r14, r25	C 32		7   5
	getf.sig	r24 = f36		C 33	01	8   5
	st8		[r20] = r14, 8		C 34		8   5
	nop.b		0			C 35		8   5
	;;
.LL10:
	.pred.rel "mutex", p8, p9
	getf.sig	r28 = f40		C 36	04	9   6
	xma.l		f39 = f35, f6, f47	C 37	42,03	9   6
  (p8)	add		r16 = r29, r26, 1	C 38		9   6
	ldf8		f46 = [rp], 8		C 39		9   6
	xma.hu		f43 = f35, f6, f47	C 40	42,03	9   6
  (p9)	add		r16 = r29, r26		C 41		9   6
	;;
	.pred.rel "mutex", p8, p9
	ldf8		f35 = [up], 8		C 42		10  7
  (p8)	cmp.leu		p6, p7 = r16, r26	C 43		10  7
  (p9)	cmp.ltu		p6, p7 = r16, r26	C 44		10  7
	getf.sig	r25 = f37		C 45	13	11  7
	st8		[r20] = r16, 8		C 46		11  7
	br.cloop.dptk	.Loop			C 47		11  7
C *** MAIN LOOP END ***
	;;
C Wind-down.  Same carry steps as the loop, with the up loads dropped at once
C and the remaining rp load and the multiplies dropped as the pipeline drains.
.Le0:
	.pred.rel "mutex", p6, p7
	getf.sig	r29 = f41		C
	xma.l		f36 = f32, f6, f44	C
  (p6)	add		r14 = r30, r27, 1	C
	ldf8		f47 = [rp], 8		C
	xma.hu		f40 = f32, f6, f44	C
  (p7)	add		r14 = r30, r27		C
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r27	C
  (p7)	cmp.ltu		p8, p9 = r14, r27	C
	getf.sig	r26 = f38		C
	st8		[r20] = r14, 8		C
	;;
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42		C
	xma.l		f37 = f33, f6, f45	C
  (p8)	add		r16 = r31, r24, 1	C
	xma.hu		f41 = f33, f6, f45	C
  (p9)	add		r16 = r31, r24		C
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r24	C
  (p9)	cmp.ltu		p6, p7 = r16, r24	C
	getf.sig	r27 = f39		C
	st8		[r20] = r16, 8		C
	;;
.Lcj8:
	.pred.rel "mutex", p6, p7
	getf.sig	r31 = f43		C
	xma.l		f38 = f34, f6, f46	C
  (p6)	add		r14 = r28, r25, 1	C
	xma.hu		f42 = f34, f6, f46	C
  (p7)	add		r14 = r28, r25		C
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r25	C
  (p7)	cmp.ltu		p8, p9 = r14, r25	C
	getf.sig	r24 = f36		C
	st8		[r20] = r14, 8		C
	;;
.Lcj7:
	.pred.rel "mutex", p8, p9
	getf.sig	r28 = f40		C
	xma.l		f39 = f35, f6, f47	C
  (p8)	add		r16 = r29, r26, 1	C
	xma.hu		f43 = f35, f6, f47	C
  (p9)	add		r16 = r29, r26		C
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r26	C
  (p9)	cmp.ltu		p6, p7 = r16, r26	C
	getf.sig	r25 = f37		C
	st8		[r20] = r16, 8		C
	;;
.Lcj6:
	.pred.rel "mutex", p6, p7
	getf.sig	r29 = f41		C
  (p6)	add		r14 = r30, r27, 1	C
  (p7)	add		r14 = r30, r27		C
	;;
	.pred.rel "mutex", p6, p7
  (p6)	cmp.leu		p8, p9 = r14, r27	C
  (p7)	cmp.ltu		p8, p9 = r14, r27	C
	getf.sig	r26 = f38		C
	st8		[r20] = r14, 8		C
	;;
.Lcj5:
	.pred.rel "mutex", p8, p9
	getf.sig	r30 = f42		C
  (p8)	add		r16 = r31, r24, 1	C
  (p9)	add		r16 = r31, r24		C
	;;
	.pred.rel "mutex", p8, p9
  (p8)	cmp.leu		p6, p7 = r16, r24	C
  (p9)	cmp.ltu		p6, p7 = r16, r24	C
	getf.sig	r27 = f39		C
	st8		[r20] = r16, 8		C
	;;
C r8 receives the high half of the last product here; the final carry is
C added to it just before the return.
.Lcj4:
	.pred.rel "mutex", p6, p7
	getf.sig	r8 = f43		C
  (p6)	add		r14 = r28, r25, 1	C
  (p7)	add		r14 = r28, r25		C
	;;
	.pred.rel "mutex", p6, p7
	st8		[r20] = r14, 8		C
  (p6)	cmp.leu		p8, p9 = r14, r25	C
  (p7)	cmp.ltu		p8, p9 = r14, r25	C
	;;
.Lcj3:
	.pred.rel "mutex", p8, p9
  (p8)	add		r16 = r29, r26, 1	C
  (p9)	add		r16 = r29, r26		C
	;;
	.pred.rel "mutex", p8, p9
	st8		[r20] = r16, 8		C
  (p8)	cmp.leu		p6, p7 = r16, r26	C
  (p9)	cmp.ltu		p6, p7 = r16, r26	C
	;;
.Lcj2:
	.pred.rel "mutex", p6, p7
  (p6)	add		r14 = r30, r27, 1	C
  (p7)	add		r14 = r30, r27		C
	;;
	.pred.rel "mutex", p6, p7
	st8		[r20] = r14		C
  (p6)	cmp.leu		p8, p9 = r14, r27	C
  (p7)	cmp.ltu		p8, p9 = r14, r27	C
	;;
  (p8)	add		r8 = 1, r8		C M I
	mov.i		ar.lc = r2		C I0
	br.ret.sptk.many b0			C B
EPILOGUE()
ASM_END()