1 1.1 mrg dnl IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and 2 1.1 mrg dnl store the low n+1 limbs of the result at rp, returning the most significant limb in r8. 3 1.1 mrg 4 1.1.1.2 mrg dnl Contributed to the GNU project by Torbjorn Granlund. 5 1.1.1.2 mrg 6 1.1.1.2 mrg dnl Copyright 2004, 2011 Free Software Foundation, Inc. 7 1.1 mrg 8 1.1 mrg dnl This file is part of the GNU MP Library. 9 1.1.1.3 mrg dnl 10 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 11 1.1.1.3 mrg dnl it under the terms of either: 12 1.1.1.3 mrg dnl 13 1.1.1.3 mrg dnl * the GNU Lesser General Public License as published by the Free 14 1.1.1.3 mrg dnl Software Foundation; either version 3 of the License, or (at your 15 1.1.1.3 mrg dnl option) any later version. 16 1.1.1.3 mrg dnl 17 1.1.1.3 mrg dnl or 18 1.1.1.3 mrg dnl 19 1.1.1.3 mrg dnl * the GNU General Public License as published by the Free Software 20 1.1.1.3 mrg dnl Foundation; either version 2 of the License, or (at your option) any 21 1.1.1.3 mrg dnl later version. 22 1.1.1.3 mrg dnl 23 1.1.1.3 mrg dnl or both in parallel, as here. 24 1.1.1.3 mrg dnl 25 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 26 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 27 1.1.1.3 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 28 1.1.1.3 mrg dnl for more details. 29 1.1.1.3 mrg dnl 30 1.1.1.3 mrg dnl You should have received copies of the GNU General Public License and the 31 1.1.1.3 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 32 1.1.1.3 mrg dnl see https://www.gnu.org/licenses/. 33 1.1 mrg 34 1.1 mrg include(`../config.m4') 35 1.1 mrg 36 1.1 mrg C cycles/limb 37 1.1.1.2 mrg C Itanium: ? 38 1.1.1.2 mrg C Itanium 2: 1.5 39 1.1 mrg 40 1.1 mrg C TODO 41 1.1 mrg C * Clean up variable names, and try to decrease the number of distinct 42 1.1 mrg C registers used. 
43 1.1.1.2 mrg C * Clean up feed-in code to not require zeroing several registers.
44 1.1 mrg C * Make sure we don't depend on uninitialized predicate registers.
45 1.1 mrg C * Could perhaps save a few cycles by using 1 c/l carry propagation in
46 1.1 mrg C wind-down code.
47 1.1 mrg C * Ultimately rewrite. The problem with this code is that it first uses a
48 1.1 mrg C loaded u value in one xma pair, then leaves it live over several unrelated
49 1.1 mrg C xma pairs, before it uses it again. It should actually be quite possible
50 1.1 mrg C to just swap some aligned xma pairs around. But we should then schedule
51 1.1 mrg C u loads further from the first use.
52 1.1 mrg
C REGISTER MAP
C The four arguments arrive in r32-r35: rp is the store pointer, up the
C pointer the u limbs are loaded from, n the limb count that is reduced to a
C loop count, and vp the pointer the two v limbs are loaded from.
53 1.1 mrg C INPUT PARAMETERS
54 1.1 mrg define(`rp',`r32')
55 1.1 mrg define(`up',`r33')
56 1.1 mrg define(`n',`r34')
57 1.1 mrg define(`vp',`r35')
58 1.1 mrg
C NOTE(review): srp is defined here but is not referenced anywhere in the
C code below - confirm whether it can be dropped.
59 1.1 mrg define(`srp',`r3')
60 1.1 mrg
C v0 and v1 hold the two limbs loaded from vp by the entry code.
61 1.1 mrg define(`v0',`f6')
62 1.1 mrg define(`v1',`f7')
63 1.1 mrg
C s0 is the limb value written through rp; acc0 is the partial sum that
C feeds s0.  The entry code also uses r14 directly for n mod 4 before s0
C is first written.
64 1.1 mrg define(`s0',`r14')
65 1.1 mrg define(`acc0',`r15')
66 1.1 mrg
C Integer copies, fetched with getfsig, of the floating-point products
C defined further down: pr0_k comes from fp0b_k, pr1_k from fp1b_k and
C acc1_k from fp2a_k.  The suffixes _0 to _3 are the four slots used by the
C 4-way unrolled loop.
67 1.1 mrg define(`pr0_0',`r16') define(`pr0_1',`r17')
68 1.1 mrg define(`pr0_2',`r18') define(`pr0_3',`r19')
69 1.1 mrg
70 1.1 mrg define(`pr1_0',`r20') define(`pr1_1',`r21')
71 1.1 mrg define(`pr1_2',`r22') define(`pr1_3',`r23')
72 1.1 mrg
73 1.1 mrg define(`acc1_0',`r24') define(`acc1_1',`r25')
74 1.1 mrg define(`acc1_2',`r26') define(`acc1_3',`r27')
75 1.1 mrg
76 1.1 mrg dnl define(`',`r28')
77 1.1 mrg dnl define(`',`r29')
78 1.1 mrg dnl define(`',`r30')
79 1.1 mrg dnl define(`',`r31')
80 1.1 mrg
C Floating-point product registers, as written by the xma instructions:
C   fp0b_k = low  half of u * v0           (xma.l  u, v0, f0)
C   fp1a_k = high half of u * v0           (xma.hu u, v0, f0)
C   fp1b_k = low  half of u * v1 + fp1a_k  (xma.l  u, v1, fp1a_k)
C   fp2a_k = high half of u * v1 + fp1a_k  (xma.hu u, v1, fp1a_k)
81 1.1 mrg define(`fp0b_0',`f8') define(`fp0b_1',`f9')
82 1.1 mrg define(`fp0b_2',`f10') define(`fp0b_3',`f11')
83 1.1 mrg
84 1.1 mrg define(`fp1a_0',`f12') define(`fp1a_1',`f13')
85 1.1 mrg define(`fp1a_2',`f14') define(`fp1a_3',`f15')
86 1.1 mrg
87 1.1 mrg define(`fp1b_0',`f32') define(`fp1b_1',`f33')
88 1.1 mrg define(`fp1b_2',`f34') define(`fp1b_3',`f35')
89 1.1 mrg
90 1.1 mrg define(`fp2a_0',`f36') define(`fp2a_1',`f37')
91 1.1 mrg define(`fp2a_2',`f38') define(`fp2a_3',`f39')
92 1.1 mrg
C u_0 to u_3 receive successive limbs loaded from up; ux and uy receive the
C first two limbs, loaded by the entry code.
93 1.1 mrg define(`u_0',`f44') define(`u_1',`f45')
94 1.1 mrg define(`u_2',`f46') define(`u_3',`f47')
95 1.1 mrg
96 1.1 mrg define(`ux',`f49')
97 1.1 mrg define(`uy',`f51')
98 1.1 mrg
99 1.1 mrg ASM_START()
100 1.1 mrg PROLOGUE(mpn_mul_2)
101 1.1 mrg .prologue
102 1.1 mrg .save ar.lc, r2
103 1.1 mrg .body
104 1.1 mrg
C When HAVE_ABI_32 is defined, the three pointer arguments are passed
C through addp4 and n is zero-extended from 32 bits with zxt4.
105 1.1.1.2 mrg ifdef(`HAVE_ABI_32',`
106 1.1.1.3 mrg {.mmi; addp4 rp = 0, rp C M I
107 1.1.1.2 mrg addp4 up = 0, up C M I
108 1.1.1.2 mrg addp4 vp = 0, vp C M I
109 1.1.1.3 mrg }{.mmi; nop 1
110 1.1.1.2 mrg nop 1
111 1.1.1.2 mrg zxt4 n = n C I
112 1.1.1.3 mrg ;;
113 1.1.1.3 mrg }')
114 1.1 mrg
C ENTRY
C Load ux, uy (first two u limbs) and v0, v1, save ar.lc in r2, compute
C r14 = n mod 4 and the loop count (n-2)>>2 into ar.lc, then dispatch on
C r14: 1 -> L(b01), 2 -> L(b10), 3 -> L(b11), 0 falls through to L(b00).
115 1.1.1.3 mrg {.mmi; ldf8 ux = [up], 8 C M
116 1.1.1.2 mrg ldf8 v0 = [vp], 8 C M
117 1.1.1.2 mrg mov r2 = ar.lc C I0
118 1.1.1.3 mrg }{.mmi; nop 1 C M
119 1.1.1.2 mrg and r14 = 3, n C M I
120 1.1.1.2 mrg add n = -2, n C M I
121 1.1.1.2 mrg ;;
122 1.1.1.3 mrg }{.mmi; ldf8 uy = [up], 8 C M
123 1.1.1.2 mrg ldf8 v1 = [vp] C M
124 1.1.1.3 mrg shr.u n = n, 2 C I0
125 1.1.1.3 mrg }{.mmi; nop 1 C M
126 1.1.1.2 mrg cmp.eq p10, p0 = 1, r14 C M I
127 1.1.1.2 mrg cmp.eq p11, p0 = 2, r14 C M I
128 1.1.1.2 mrg ;;
129 1.1.1.3 mrg }{.mmi; nop 1 C M
130 1.1.1.2 mrg cmp.eq p12, p0 = 3, r14 C M I
131 1.1.1.2 mrg mov ar.lc = n C I0
132 1.1.1.3 mrg }{.bbb; (p10) br.dptk L(b01) C B
133 1.1.1.2 mrg (p11) br.dptk L(b10) C B
134 1.1.1.2 mrg (p12) br.dptk L(b11) C B
135 1.1 mrg ;;
136 1.1.1.3 mrg }
C FEED-IN, n mod 4 = 0
C Zeroes acc1_2, pr1_2 and pr0_3, and the cmp.ne r0, r0 compares clear p8
C and p12 (setting p9 and p13) so the first carry-ins are zero.  br.cloop
C is taken when the loop count is nonzero (n > 4); that path, L(gt4), joins
C the main loop at L(00).  Otherwise (n = 4) the fall-through code joins the
C wind-down at L(cj4).
137 1.1 mrg ALIGN(32)
138 1.1.1.2 mrg L(b00): ldf8 u_1 = [up], 8
139 1.1.1.2 mrg mov acc1_2 = 0
140 1.1.1.2 mrg mov pr1_2 = 0
141 1.1.1.2 mrg mov pr0_3 = 0
142 1.1.1.2 mrg cmp.ne p8, p9 = r0, r0
143 1.1.1.2 mrg ;;
144 1.1.1.2 mrg xma.l fp0b_3 = ux, v0, f0
145 1.1.1.2 mrg cmp.ne p12, p13 = r0, r0
146 1.1.1.2 mrg ldf8 u_2 = [up], 8
147 1.1.1.2 mrg xma.hu fp1a_3 = ux, v0, f0
148 1.1.1.2 mrg br.cloop.dptk L(gt4)
149 1.1.1.2 mrg
150 1.1.1.2 mrg xma.l fp0b_0 = uy, v0, f0
151 1.1.1.2 mrg xma.hu fp1a_0 = uy, v0, f0
152 1.1.1.2 mrg ;;
153 1.1.1.2 mrg getfsig acc0 = fp0b_3
154 1.1.1.2 mrg xma.l fp1b_3 = ux, v1, fp1a_3
155 1.1.1.2 mrg xma.hu fp2a_3 = ux, v1, fp1a_3
156 1.1.1.2 mrg ;;
157 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
158 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
159 1.1.1.2 mrg ;;
160 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
161 1.1.1.2 mrg xma.l fp1b_0 = uy, v1, fp1a_0
162 1.1.1.2 mrg xma.hu fp2a_0 = uy, v1, fp1a_0
163 1.1.1.2 mrg ;;
164 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
165 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
166 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
167 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
168 1.1.1.2 mrg br L(cj4)
169 1.1.1.2 mrg
170 1.1.1.2 mrg L(gt4): xma.l fp0b_0 = uy, v0, f0
171 1.1.1.2 mrg xma.hu fp1a_0 = uy, v0, f0
172 1.1.1.2 mrg ;;
173 1.1.1.2 mrg getfsig acc0 = fp0b_3
174 1.1.1.2 mrg xma.l fp1b_3 = ux, v1, fp1a_3
175 1.1.1.2 mrg ldf8 u_3 = [up], 8
176 1.1.1.2 mrg xma.hu fp2a_3 = ux, v1, fp1a_3
177 1.1.1.2 mrg ;;
178 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
179 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
180 1.1.1.2 mrg ;;
181 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
182 1.1.1.2 mrg xma.l fp1b_0 = uy, v1, fp1a_0
183 1.1.1.2 mrg xma.hu fp2a_0 = uy, v1, fp1a_0
184 1.1.1.2 mrg ;;
185 1.1.1.2 mrg ldf8 u_0 = [up], 8
186 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
187 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
188 1.1.1.2 mrg ;;
189 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
190 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
191 1.1.1.2 mrg br L(00)
192 1.1 mrg
193 1.1 mrg
C FEED-IN, n mod 4 = 1
C Zeroes acc1_1, pr1_1 and pr0_2 and clears p6 and p10 (setting p7, p11).
C br.cloop is taken when n > 5; that path, L(gt5), joins the main loop at
C L(01).  Otherwise (n = 5) the fall-through code joins the wind-down at
C L(cj5).
194 1.1 mrg ALIGN(32)
195 1.1.1.2 mrg L(b01): ldf8 u_0 = [up], 8 C M
196 1.1.1.2 mrg mov acc1_1 = 0 C M I
197 1.1.1.2 mrg mov pr1_1 = 0 C M I
198 1.1.1.2 mrg mov pr0_2 = 0 C M I
199 1.1.1.2 mrg cmp.ne p6, p7 = r0, r0 C M I
200 1.1.1.2 mrg ;;
201 1.1.1.2 mrg xma.l fp0b_2 = ux, v0, f0 C F
202 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0 C M I
203 1.1.1.2 mrg ldf8 u_1 = [up], 8 C M
204 1.1.1.2 mrg xma.hu fp1a_2 = ux, v0, f0 C F
205 1.1.1.2 mrg ;;
206 1.1.1.2 mrg xma.l fp0b_3 = uy, v0, f0 C F
207 1.1.1.2 mrg xma.hu fp1a_3 = uy, v0, f0 C F
208 1.1.1.2 mrg ;;
209 1.1.1.2 mrg getfsig acc0 = fp0b_2 C M
210 1.1.1.2 mrg xma.l fp1b_2 = ux, v1,fp1a_2 C F
211 1.1.1.2 mrg ldf8 u_2 = [up], 8 C M
212 1.1.1.2 mrg xma.hu fp2a_2 = ux, v1,fp1a_2 C F
213 1.1.1.2 mrg br.cloop.dptk L(gt5)
214 1.1.1.2 mrg
215 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0 C F
216 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0 C F
217 1.1.1.2 mrg ;;
218 1.1.1.2 mrg getfsig pr0_3 = fp0b_3 C M
219 1.1.1.2 mrg xma.l fp1b_3 = uy, v1,fp1a_3 C F
220 1.1.1.2 mrg xma.hu fp2a_3 = uy, v1,fp1a_3 C F
221 1.1.1.2 mrg ;;
222 1.1.1.2 mrg getfsig pr1_2 = fp1b_2 C M
223 1.1.1.2 mrg getfsig acc1_2 = fp2a_2 C M
224 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0 C F
225 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0 C F
226 1.1.1.2 mrg br L(cj5)
227 1.1.1.2 mrg
228 1.1.1.2 mrg L(gt5): xma.l fp0b_0 = u_0, v0, f0
229 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
230 1.1.1.2 mrg ;;
231 1.1.1.2 mrg getfsig pr0_3 = fp0b_3
232 1.1.1.2 mrg xma.l fp1b_3 = uy, v1, fp1a_3
233 1.1.1.2 mrg xma.hu fp2a_3 = uy, v1, fp1a_3
234 1.1.1.2 mrg ;;
235 1.1.1.2 mrg ldf8 u_3 = [up], 8
236 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
237 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
238 1.1.1.2 mrg ;;
239 1.1.1.2 mrg getfsig acc1_2 = fp2a_2
240 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
241 1.1.1.2 mrg br L(01)
242 1.1 mrg
243 1.1 mrg
C FEED-IN, n mod 4 = 2
C br.cloop is taken when n > 2 and goes to L(gt2).  Otherwise (n = 2) the
C whole product is formed inline: three limbs are stored through rp (one
C stf8 and two st8), ar.lc is restored from r2, and r8 is set from fp2a_2
C plus a possible carry before returning.
244 1.1 mrg ALIGN(32)
245 1.1.1.2 mrg L(b10): br.cloop.dptk L(gt2)
246 1.1.1.2 mrg xma.l fp0b_1 = ux, v0, f0
247 1.1.1.2 mrg xma.hu fp1a_1 = ux, v0, f0
248 1.1.1.2 mrg ;;
249 1.1.1.2 mrg xma.l fp0b_2 = uy, v0, f0
250 1.1.1.2 mrg xma.hu fp1a_2 = uy, v0, f0
251 1.1.1.2 mrg ;;
252 1.1.1.2 mrg stf8 [rp] = fp0b_1, 8
253 1.1.1.2 mrg xma.l fp1b_1 = ux, v1, fp1a_1
254 1.1.1.2 mrg xma.hu fp2a_1 = ux, v1, fp1a_1
255 1.1.1.2 mrg ;;
256 1.1.1.2 mrg getfsig acc0 = fp0b_2
257 1.1.1.2 mrg xma.l fp1b_2 = uy, v1, fp1a_2
258 1.1.1.2 mrg xma.hu fp2a_2 = uy, v1, fp1a_2
259 1.1.1.2 mrg ;;
260 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
261 1.1.1.2 mrg getfsig acc1_1 = fp2a_1
262 1.1.1.2 mrg mov ar.lc = r2
263 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
264 1.1.1.2 mrg getfsig r8 = fp2a_2
265 1.1.1.2 mrg ;;
266 1.1.1.2 mrg add s0 = pr1_1, acc0
267 1.1.1.2 mrg ;;
268 1.1.1.2 mrg st8 [rp] = s0, 8
269 1.1.1.2 mrg cmp.ltu p8, p9 = s0, pr1_1
C r31 = -1 - acc1_1, i.e. the largest value that can be added to acc1_1
C without wrapping; the compares below test pr1_2 against it to get the
C carry-out of the next add before that add result is available.
270 1.1.1.2 mrg sub r31 = -1, acc1_1
271 1.1.1.2 mrg ;;
272 1.1.1.3 mrg .pred.rel "mutex", p8, p9
273 1.1.1.2 mrg (p8) add acc0 = pr1_2, acc1_1, 1
274 1.1.1.2 mrg (p9) add acc0 = pr1_2, acc1_1
275 1.1.1.2 mrg (p8) cmp.leu p10, p0 = r31, pr1_2
276 1.1.1.2 mrg (p9) cmp.ltu p10, p0 = r31, pr1_2
277 1.1.1.2 mrg ;;
278 1.1.1.2 mrg st8 [rp] = acc0, 8
279 1.1.1.2 mrg (p10) add r8 = 1, r8
280 1.1.1.2 mrg br.ret.sptk.many b0
281 1.1.1.2 mrg
C n mod 4 = 2 with n > 2: zeroes acc1_0, pr1_0 and pr0_1, clears p8 and
C p12 (setting p9, p13) and joins the main loop at L(10).
282 1.1.1.2 mrg L(gt2): ldf8 u_3 = [up], 8
283 1.1.1.2 mrg mov acc1_0 = 0
284 1.1.1.2 mrg mov pr1_0 = 0
285 1.1.1.2 mrg ;;
286 1.1.1.2 mrg mov pr0_1 = 0
287 1.1.1.2 mrg xma.l fp0b_1 = ux, v0, f0
288 1.1.1.2 mrg ldf8 u_0 = [up], 8
289 1.1.1.2 mrg xma.hu fp1a_1 = ux, v0, f0
290 1.1.1.2 mrg ;;
291 1.1.1.2 mrg xma.l fp0b_2 = uy, v0, f0
292 1.1.1.2 mrg xma.hu fp1a_2 = uy, v0, f0
293 1.1.1.2 mrg ;;
294 1.1.1.2 mrg getfsig acc0 = fp0b_1
295 1.1.1.2 mrg xma.l fp1b_1 = ux, v1, fp1a_1
296 1.1.1.2 mrg xma.hu fp2a_1 = ux, v1, fp1a_1
297 1.1.1.2 mrg ;;
298 1.1.1.2 mrg ldf8 u_1 = [up], 8
299 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
300 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
301 1.1.1.2 mrg ;;
302 1.1.1.2 mrg getfsig pr0_2 = fp0b_2
303 1.1.1.2 mrg xma.l fp1b_2 = uy, v1, fp1a_2
304 1.1.1.2 mrg xma.hu fp2a_2 = uy, v1, fp1a_2
305 1.1.1.2 mrg ;;
306 1.1.1.2 mrg ldf8 u_2 = [up], 8
307 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
308 1.1.1.2 mrg ;;
309 1.1.1.3 mrg {.mfi; getfsig acc1_1 = fp2a_1
310 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0
311 1.1.1.2 mrg cmp.ne p8, p9 = r0, r0
312 1.1.1.3 mrg }{.mfb; cmp.ne p12, p13 = r0, r0
313 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
314 1.1.1.2 mrg br L(10)
315 1.1.1.3 mrg }
316 1.1 mrg
C FEED-IN, n mod 4 = 3
C Zeroes acc1_3, pr1_3 and pr0_0 and clears p6 and p10 (setting p7, p11).
C br.cloop is taken when n > 3; that path, L(gt3), joins the main loop at
C L(11).  Otherwise (n = 3) the fall-through code joins the wind-down at
C L(cj3).
317 1.1 mrg ALIGN(32)
318 1.1.1.2 mrg L(b11): mov acc1_3 = 0
319 1.1.1.2 mrg mov pr1_3 = 0
320 1.1.1.2 mrg mov pr0_0 = 0
321 1.1.1.2 mrg ldf8 u_2 = [up], 8
322 1.1.1.2 mrg cmp.ne p6, p7 = r0, r0
323 1.1.1.2 mrg br.cloop.dptk L(gt3)
324 1.1.1.2 mrg ;;
325 1.1.1.2 mrg xma.l fp0b_0 = ux, v0, f0
326 1.1.1.2 mrg xma.hu fp1a_0 = ux, v0, f0
327 1.1.1.2 mrg ;;
328 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0
329 1.1.1.2 mrg xma.l fp0b_1 = uy, v0, f0
330 1.1.1.2 mrg xma.hu fp1a_1 = uy, v0, f0
331 1.1.1.2 mrg ;;
332 1.1.1.2 mrg getfsig acc0 = fp0b_0
333 1.1.1.2 mrg xma.l fp1b_0 = ux, v1, fp1a_0
334 1.1.1.2 mrg xma.hu fp2a_0 = ux, v1, fp1a_0
335 1.1.1.2 mrg ;;
336 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
337 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
338 1.1.1.2 mrg ;;
339 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
340 1.1.1.2 mrg xma.l fp1b_1 = uy, v1, fp1a_1
341 1.1.1.2 mrg xma.hu fp2a_1 = uy, v1, fp1a_1
342 1.1.1.2 mrg ;;
343 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
344 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
345 1.1.1.2 mrg br L(cj3)
346 1.1.1.2 mrg
347 1.1.1.2 mrg L(gt3): xma.l fp0b_0 = ux, v0, f0
348 1.1.1.2 mrg cmp.ne p10, p11 = r0, r0
349 1.1.1.2 mrg ldf8 u_3 = [up], 8
350 1.1.1.2 mrg xma.hu fp1a_0 = ux, v0, f0
351 1.1.1.2 mrg ;;
352 1.1.1.2 mrg xma.l fp0b_1 = uy, v0, f0
353 1.1.1.2 mrg xma.hu fp1a_1 = uy, v0, f0
354 1.1.1.2 mrg ;;
355 1.1.1.2 mrg getfsig acc0 = fp0b_0
356 1.1.1.2 mrg xma.l fp1b_0 = ux, v1, fp1a_0
357 1.1.1.2 mrg ldf8 u_0 = [up], 8
358 1.1.1.2 mrg xma.hu fp2a_0 = ux, v1, fp1a_0
359 1.1.1.2 mrg ;;
360 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
361 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
362 1.1.1.2 mrg ;;
363 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
364 1.1.1.2 mrg xma.l fp1b_1 = uy, v1, fp1a_1
365 1.1.1.2 mrg xma.hu fp2a_1 = uy, v1, fp1a_1
366 1.1.1.2 mrg ;;
367 1.1.1.2 mrg ldf8 u_1 = [up], 8
368 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
369 1.1.1.2 mrg ;;
370 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
371 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
372 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
373 1.1.1.2 mrg br L(11)
374 1.1 mrg
375 1.1 mrg
C MAIN LOOP
C Each pass stores four limbs through rp (four st8) and loads four further
C u limbs.  The margin numbers 00 to 11 label the twelve instruction groups
C of one pass.  L(01), L(00), L(11) and L(10) are the entry points used by
C the feed-in code above.
C
C Carry scheme: each add has a predicated twin that adds 1 when the
C carry-in predicate is set.  The carry-out is then derived by an unsigned
C compare of the sum against one addend: cmp.leu when the carry-in was set,
C cmp.ltu when it was clear.  Two such chains run interleaved, one producing
C acc0 (predicates p6/p7 and p8/p9) and one producing s0 (p10/p11 and
C p12/p13).
376 1.1 mrg C *** MAIN LOOP START ***
377 1.1 mrg ALIGN(32)
378 1.1.1.2 mrg L(top): C 00
379 1.1.1.3 mrg .pred.rel "mutex", p8, p9
380 1.1.1.3 mrg .pred.rel "mutex", p12, p13
381 1.1.1.2 mrg ldf8 u_3 = [up], 8
382 1.1.1.2 mrg getfsig pr1_2 = fp1b_2
383 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
384 1.1.1.2 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_1
385 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
386 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
387 1.1 mrg ;; C 01
388 1.1.1.3 mrg .pred.rel "mutex", p6, p7
389 1.1.1.2 mrg getfsig acc1_2 = fp2a_2
390 1.1.1.2 mrg st8 [rp] = s0, 8
391 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
392 1.1.1.2 mrg (p6) add acc0 = pr0_2, acc1_0, 1
393 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
394 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
395 1.1 mrg ;; C 02
396 1.1.1.2 mrg L(01):
397 1.1.1.3 mrg .pred.rel "mutex", p10, p11
398 1.1.1.2 mrg getfsig pr0_0 = fp0b_0
399 1.1.1.2 mrg xma.l fp1b_0 = u_0, v1, fp1a_0
400 1.1.1.2 mrg (p10) add s0 = pr1_1, acc0, 1
401 1.1.1.2 mrg (p11) add s0 = pr1_1, acc0
402 1.1.1.2 mrg xma.hu fp2a_0 = u_0, v1, fp1a_0
403 1.1.1.2 mrg nop 1
404 1.1 mrg ;; C 03
405 1.1.1.3 mrg .pred.rel "mutex", p6, p7
406 1.1.1.3 mrg .pred.rel "mutex", p10, p11
407 1.1.1.2 mrg ldf8 u_0 = [up], 8
408 1.1.1.2 mrg getfsig pr1_3 = fp1b_3
409 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
410 1.1.1.2 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_2
411 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
412 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
413 1.1 mrg ;; C 04
414 1.1.1.3 mrg .pred.rel "mutex", p8, p9
415 1.1.1.2 mrg getfsig acc1_3 = fp2a_3
416 1.1.1.2 mrg st8 [rp] = s0, 8
417 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
418 1.1.1.2 mrg (p8) add acc0 = pr0_3, acc1_1, 1
419 1.1.1.2 mrg (p9) add acc0 = pr0_3, acc1_1
420 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
421 1.1 mrg ;; C 05
422 1.1.1.2 mrg L(00):
423 1.1.1.3 mrg .pred.rel "mutex", p12, p13
424 1.1.1.2 mrg getfsig pr0_1 = fp0b_1
425 1.1.1.2 mrg xma.l fp1b_1 = u_1, v1, fp1a_1
426 1.1.1.2 mrg (p12) add s0 = pr1_2, acc0, 1
427 1.1.1.2 mrg (p13) add s0 = pr1_2, acc0
428 1.1.1.2 mrg xma.hu fp2a_1 = u_1, v1, fp1a_1
429 1.1.1.2 mrg nop 1
430 1.1 mrg ;; C 06
431 1.1.1.3 mrg .pred.rel "mutex", p8, p9
432 1.1.1.3 mrg .pred.rel "mutex", p12, p13
433 1.1.1.2 mrg ldf8 u_1 = [up], 8
434 1.1.1.2 mrg getfsig pr1_0 = fp1b_0
435 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_3
436 1.1.1.2 mrg (p9) cmp.ltu p6, p7 = acc0, pr0_3
437 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_2
438 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_2
439 1.1 mrg ;; C 07
440 1.1.1.3 mrg .pred.rel "mutex", p6, p7
441 1.1.1.2 mrg getfsig acc1_0 = fp2a_0
442 1.1.1.2 mrg st8 [rp] = s0, 8
443 1.1.1.2 mrg xma.l fp0b_3 = u_3, v0, f0
444 1.1.1.2 mrg (p6) add acc0 = pr0_0, acc1_2, 1
445 1.1.1.2 mrg (p7) add acc0 = pr0_0, acc1_2
446 1.1.1.2 mrg xma.hu fp1a_3 = u_3, v0, f0
447 1.1 mrg ;; C 08
448 1.1.1.2 mrg L(11):
449 1.1.1.3 mrg .pred.rel "mutex", p10, p11
450 1.1.1.2 mrg getfsig pr0_2 = fp0b_2
451 1.1.1.2 mrg xma.l fp1b_2 = u_2, v1, fp1a_2
452 1.1.1.2 mrg (p10) add s0 = pr1_3, acc0, 1
453 1.1.1.2 mrg (p11) add s0 = pr1_3, acc0
454 1.1.1.2 mrg xma.hu fp2a_2 = u_2, v1, fp1a_2
455 1.1.1.2 mrg nop 1
456 1.1 mrg ;; C 09
457 1.1.1.3 mrg .pred.rel "mutex", p6, p7
458 1.1.1.3 mrg .pred.rel "mutex", p10, p11
459 1.1.1.2 mrg ldf8 u_2 = [up], 8
460 1.1.1.2 mrg getfsig pr1_1 = fp1b_1
461 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_0
462 1.1.1.2 mrg (p7) cmp.ltu p8, p9 = acc0, pr0_0
463 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_3
464 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_3
465 1.1 mrg ;; C 10
466 1.1.1.3 mrg .pred.rel "mutex", p8, p9
467 1.1.1.2 mrg getfsig acc1_1 = fp2a_1
468 1.1.1.2 mrg st8 [rp] = s0, 8
469 1.1.1.2 mrg xma.l fp0b_0 = u_0, v0, f0
470 1.1.1.2 mrg (p8) add acc0 = pr0_1, acc1_3, 1
471 1.1.1.2 mrg (p9) add acc0 = pr0_1, acc1_3
472 1.1.1.2 mrg xma.hu fp1a_0 = u_0, v0, f0
473 1.1 mrg ;; C 11
474 1.1.1.2 mrg L(10):
475 1.1.1.3 mrg .pred.rel "mutex", p12, p13
476 1.1.1.2 mrg getfsig pr0_3 = fp0b_3
477 1.1.1.2 mrg xma.l fp1b_3 = u_3, v1, fp1a_3
478 1.1.1.2 mrg (p12) add s0 = pr1_0, acc0, 1
479 1.1.1.2 mrg (p13) add s0 = pr1_0, acc0
480 1.1.1.2 mrg xma.hu fp2a_3 = u_3, v1, fp1a_3
481 1.1.1.2 mrg br.cloop.dptk L(top)
482 1.1 mrg ;;
483 1.1 mrg C *** MAIN LOOP END ***
484 1.1 mrg
C WIND-DOWN
C No further u limbs are loaded from here on.  The products still in
C flight are drained with the same carry scheme as in the loop.  L(cj5),
C L(cj4) and L(cj3) are the entry points for the short operand sizes
C n = 5, 4 and 3 coming from the feed-in code.
485 1.1.1.3 mrg .pred.rel "mutex", p8, p9
486 1.1.1.3 mrg .pred.rel "mutex", p12, p13
487 1.1.1.3 mrg {.mmi; getfsig pr1_2 = fp1b_2
488 1.1.1.2 mrg st8 [rp] = s0, 8
489 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
490 1.1.1.3 mrg }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
491 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
492 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
493 1.1.1.2 mrg ;;
494 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
495 1.1.1.3 mrg {.mfi; getfsig acc1_2 = fp2a_2
496 1.1.1.2 mrg xma.l fp0b_1 = u_1, v0, f0
497 1.1.1.2 mrg nop 1
498 1.1.1.3 mrg }{.mmf; (p6) add acc0 = pr0_2, acc1_0, 1
499 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
500 1.1.1.2 mrg xma.hu fp1a_1 = u_1, v0, f0
501 1.1.1.2 mrg ;;
502 1.1.1.3 mrg }
503 1.1.1.2 mrg L(cj5):
504 1.1.1.3 mrg .pred.rel "mutex", p10, p11
505 1.1.1.3 mrg {.mfi; getfsig pr0_0 = fp0b_0
506 1.1.1.2 mrg xma.l fp1b_0 = u_0, v1, fp1a_0
507 1.1.1.2 mrg (p10) add s0 = pr1_1, acc0, 1
508 1.1.1.3 mrg }{.mfi; (p11) add s0 = pr1_1, acc0
509 1.1.1.2 mrg xma.hu fp2a_0 = u_0, v1, fp1a_0
510 1.1.1.2 mrg nop 1
511 1.1.1.2 mrg ;;
512 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
513 1.1.1.3 mrg .pred.rel "mutex", p10, p11
514 1.1.1.3 mrg {.mmi; getfsig pr1_3 = fp1b_3
515 1.1.1.2 mrg st8 [rp] = s0, 8
516 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
517 1.1.1.3 mrg }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
518 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
519 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
520 1.1.1.2 mrg ;;
521 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
522 1.1.1.3 mrg {.mfi; getfsig acc1_3 = fp2a_3
523 1.1.1.2 mrg xma.l fp0b_2 = u_2, v0, f0
524 1.1.1.2 mrg nop 1
525 1.1.1.3 mrg }{.mmf; (p8) add acc0 = pr0_3, acc1_1, 1
526 1.1.1.2 mrg (p9) add acc0 = pr0_3, acc1_1
527 1.1.1.2 mrg xma.hu fp1a_2 = u_2, v0, f0
528 1.1.1.2 mrg ;;
529 1.1.1.3 mrg }
530 1.1.1.2 mrg L(cj4):
531 1.1.1.3 mrg .pred.rel "mutex", p12, p13
532 1.1.1.3 mrg {.mfi; getfsig pr0_1 = fp0b_1
533 1.1.1.2 mrg xma.l fp1b_1 = u_1, v1, fp1a_1
534 1.1.1.2 mrg (p12) add s0 = pr1_2, acc0, 1
535 1.1.1.3 mrg }{.mfi; (p13) add s0 = pr1_2, acc0
536 1.1.1.2 mrg xma.hu fp2a_1 = u_1, v1, fp1a_1
537 1.1.1.2 mrg nop 1
538 1.1.1.2 mrg ;;
539 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
540 1.1.1.3 mrg .pred.rel "mutex", p12, p13
541 1.1.1.3 mrg {.mmi; getfsig pr1_0 = fp1b_0
542 1.1.1.2 mrg st8 [rp] = s0, 8
543 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_3
544 1.1.1.3 mrg }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_3
545 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_2
546 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_2
547 1.1.1.2 mrg ;;
548 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
549 1.1.1.3 mrg {.mmi; getfsig acc1_0 = fp2a_0
550 1.1.1.2 mrg (p6) add acc0 = pr0_0, acc1_2, 1
551 1.1.1.2 mrg (p7) add acc0 = pr0_0, acc1_2
552 1.1.1.2 mrg ;;
553 1.1.1.3 mrg }
554 1.1.1.2 mrg L(cj3):
555 1.1.1.3 mrg .pred.rel "mutex", p10, p11
556 1.1.1.3 mrg {.mfi; getfsig pr0_2 = fp0b_2
557 1.1.1.2 mrg xma.l fp1b_2 = u_2, v1, fp1a_2
558 1.1.1.2 mrg (p10) add s0 = pr1_3, acc0, 1
559 1.1.1.3 mrg }{.mfi; (p11) add s0 = pr1_3, acc0
560 1.1.1.2 mrg xma.hu fp2a_2 = u_2, v1, fp1a_2
561 1.1.1.2 mrg nop 1
562 1.1.1.2 mrg ;;
563 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
564 1.1.1.3 mrg .pred.rel "mutex", p10, p11
565 1.1.1.3 mrg {.mmi; getfsig pr1_1 = fp1b_1
566 1.1.1.2 mrg st8 [rp] = s0, 8
567 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_0
568 1.1.1.3 mrg }{.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_0
569 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_3
570 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_3
571 1.1.1.2 mrg ;;
572 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
573 1.1.1.3 mrg {.mmi; getfsig acc1_1 = fp2a_1
574 1.1.1.2 mrg (p8) add acc0 = pr0_1, acc1_3, 1
575 1.1.1.2 mrg (p9) add acc0 = pr0_1, acc1_3
576 1.1.1.2 mrg ;;
577 1.1.1.3 mrg } .pred.rel "mutex", p12, p13
578 1.1.1.3 mrg {.mmi; (p12) add s0 = pr1_0, acc0, 1
579 1.1.1.2 mrg (p13) add s0 = pr1_0, acc0
580 1.1.1.2 mrg nop 1
581 1.1.1.2 mrg ;;
582 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
583 1.1.1.3 mrg .pred.rel "mutex", p12, p13
584 1.1.1.3 mrg {.mmi; getfsig pr1_2 = fp1b_2
585 1.1.1.2 mrg st8 [rp] = s0, 8
586 1.1.1.2 mrg (p8) cmp.leu p6, p7 = acc0, pr0_1
587 1.1.1.3 mrg }{.mmi; (p9) cmp.ltu p6, p7 = acc0, pr0_1
588 1.1.1.2 mrg (p12) cmp.leu p10, p11 = s0, pr1_0
589 1.1.1.2 mrg (p13) cmp.ltu p10, p11 = s0, pr1_0
590 1.1.1.2 mrg ;;
C r8 starts as the high half of the last product (fp2a_2); it is bumped by
C one further down if the final carries require it, and is the value live
C in r8 at br.ret.
591 1.1.1.3 mrg } .pred.rel "mutex", p6, p7
592 1.1.1.3 mrg {.mmi; getfsig r8 = fp2a_2
593 1.1.1.2 mrg (p6) add acc0 = pr0_2, acc1_0, 1
594 1.1.1.2 mrg (p7) add acc0 = pr0_2, acc1_0
595 1.1.1.2 mrg ;;
596 1.1.1.3 mrg } .pred.rel "mutex", p10, p11
597 1.1.1.3 mrg {.mmi; (p10) add s0 = pr1_1, acc0, 1
598 1.1.1.2 mrg (p11) add s0 = pr1_1, acc0
599 1.1.1.2 mrg (p6) cmp.leu p8, p9 = acc0, pr0_2
600 1.1.1.2 mrg ;;
601 1.1.1.3 mrg } .pred.rel "mutex", p10, p11
602 1.1.1.3 mrg {.mmi; (p7) cmp.ltu p8, p9 = acc0, pr0_2
603 1.1.1.2 mrg (p10) cmp.leu p12, p13 = s0, pr1_1
604 1.1.1.2 mrg (p11) cmp.ltu p12, p13 = s0, pr1_1
605 1.1.1.2 mrg ;;
C Last stored limb: acc0 = pr1_2 + acc1_1 plus the p8 carry, then plus the
C p12 carry.  p10 collects the carry-out of both steps and, when set,
C increments r8.
C NOTE(review): cmpeqor is not defined in this file - presumably a macro
C supplied through config.m4; confirm its exact expansion.
606 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
607 1.1.1.3 mrg {.mmi; st8 [rp] = s0, 8
608 1.1.1.2 mrg (p8) add acc0 = pr1_2, acc1_1, 1
609 1.1.1.2 mrg (p9) add acc0 = pr1_2, acc1_1
610 1.1.1.2 mrg ;;
611 1.1.1.3 mrg } .pred.rel "mutex", p8, p9
612 1.1.1.3 mrg {.mmi; (p8) cmp.leu p10, p11 = acc0, pr1_2
613 1.1.1.2 mrg (p9) cmp.ltu p10, p11 = acc0, pr1_2
614 1.1.1.2 mrg (p12) add acc0 = 1, acc0
615 1.1.1.2 mrg ;;
616 1.1.1.3 mrg }{.mmi; st8 [rp] = acc0, 8
617 1.1.1.2 mrg (p12) cmpeqor p10, p0 = 0, acc0
618 1.1.1.2 mrg nop 1
619 1.1.1.2 mrg ;;
C Restore the caller value of ar.lc (saved in r2 at entry) and return.
620 1.1.1.3 mrg }{.mib; (p10) add r8 = 1, r8
621 1.1.1.2 mrg mov ar.lc = r2
622 1.1.1.2 mrg br.ret.sptk.many b0
623 1.1.1.3 mrg }
624 1.1 mrg EPILOGUE()
625 1.1 mrg ASM_END()
626