dnl  AMD64 mpn_mulmid_basecase

dnl  Contributed by David Harvey.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.375  (2.5 when un - vn is "small")
C K10:		 ?
C P4:		 ?
C P6-15:	 ?

C INPUT PARAMETERS
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`un_param',`%rdx')
define(`vp_param',`%rcx')
define(`vn',      `%r8')

C Multiplier limbs for the current pass.
define(`v0', `%r12')
define(`v1', `%r9')

C Rotating carry/accumulator limbs.  Note that w1 is the same register as
C vp_param, which is why vp_param is copied into vp before w1 is written.
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')

C n          : inner loop index
C outer_addr : address that each pass jumps to in order to start the next one
C un         : row length, later negated (and rounded in the row code)
C vp         : working copy of vp_param
define(`n',  `%r11')
define(`outer_addr', `%r14')
define(`un',  `%r13')
define(`vp',  `%r15')

C vp_inner shares a register with w3.  It is used only in the diagonal code,
C which does not use w3.
define(`vp_inner', `%r10')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C ===========================================================
C mpn_mulmid_basecase (rp, up, un_param, vp_param, vn)
C
C Overview, as implemented below:
C
C   row = un_param - vn + 1 is computed into un, and rp is advanced by row
C   limbs.  Result limbs are then stored at negative offsets from rp plus
C   the two limbs at (rp) and 8(rp).
C
C   row >= 4 (unsigned compare):
C     The first pass is mul_1 (one limb of vp) when vn is odd, or mul_2
C     (two limbs of vp) when vn is even.  Every later pass is addmul_2,
C     which consumes two more limbs of vp and adds into the result.  Each
C     pass is unrolled four ways, so the first pass dispatches on row mod 4
C     and records the matching addmul prologue in outer_addr.
C
C   row < 4:
C     L(diagonal) computes one result limb at a time by summing products
C     into the three limb accumulator w2:w1:w0, walking up downwards and
C     vp upwards.
C
C The registers pushed on entry are popped in reverse order at L(ret).

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mulmid_basecase)
	FUNC_ENTRY(4)
C DOS64 only: fetch the fifth argument into the low half of vn.
C NOTE(review): the 56 byte offset presumably accounts for what FUNC_ENTRY
C pushes - confirm against the x86_64 m4 definitions.
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	vp_param, vp		C free %rcx for use as w1

	C use un for row length (= un_param - vn + 1)
	lea	1(un_param), un
	sub	vn, un

	lea	(rp,un,8), rp		C rp += row limbs

	cmp	$4, un		C TODO: needs tuning
	jc	L(diagonal)		C row < 4 (unsigned)

	lea	(up,un_param,8), up	C up += un_param limbs

	test	$1, vn
	jz	L(mul_2)

C ===========================================================
C     mul_1 for vp[0] if vn is odd

L(mul_1):
	mov	R32(un), R32(w0)	C keep row for the mod 4 dispatch

	neg	un
	mov	(up,un,8), %rax
	mov	(vp), v0
	mul	v0			C first product of the row in rdx:rax

	and	$-4, un		C round down to multiple of 4
	mov	un, n

	C Dispatch on row mod 4: 0, then 1 (below 2), then 2, else 3.
	and	$3, R32(w0)
	jz	L(mul_1_prologue_0)
	cmp	$2, R32(w0)
	jc	L(mul_1_prologue_1)
	jz	L(mul_1_prologue_2)

	C Each prologue places the first product in the pair of w registers
	C expected at its loop entry point, and selects the addmul prologue
	C with the same residue for the following passes.
L(mul_1_prologue_3):
	mov	%rax, w3
	mov	%rdx, w0
	lea	L(addmul_prologue_3)(%rip), outer_addr
	jmp	L(mul_1_entry_3)

	ALIGN(16)
L(mul_1_prologue_0):
	mov	%rax, w2
	mov	%rdx, w3		C note already w0 == 0
	lea	L(addmul_prologue_0)(%rip), outer_addr
	jmp	L(mul_1_entry_0)

	ALIGN(16)
L(mul_1_prologue_1):
	add	$4, n
	mov	%rax, w1
	mov	%rdx, w2
	mov	$0, R32(w3)
	mov	(up,n,8), %rax
	lea	L(addmul_prologue_1)(%rip), outer_addr
	jmp	L(mul_1_entry_1)

	ALIGN(16)
L(mul_1_prologue_2):
	mov	%rax, w0
	mov	%rdx, w1
	mov	24(up,n,8), %rax
	mov	$0, R32(w2)
	mov	$0, R32(w3)
	lea	L(addmul_prologue_2)(%rip), outer_addr
	jmp	L(mul_1_entry_2)


	C this loop is 10 c/loop = 2.5 c/l on K8
	C
	C Four limbs per iteration.  n steps by 4 and the loop continues
	C while n is negative.

	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
L(mul_1_entry_1):
	mov	$0, R32(w0)
	mul	v0
	mov	w1, -8(rp,n,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,n,8), %rax
	mul	v0
	mov	w2, (rp,n,8)
	add	%rax, w3
	adc	%rdx, w0
L(mul_1_entry_3):
	mov	16(up,n,8), %rax
	mul	v0
	mov	w3, 8(rp,n,8)
	mov	$0, R32(w2)		C zero
	mov	w2, w3			C zero
	add	%rax, w0
	mov	24(up,n,8), %rax
	mov	w2, w1			C zero
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, n
	js	L(mul_1_top)

	C Wind down.  The store of w2 sits between add and adc, which is
	C fine because mov does not modify the flags.
	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	mov	w2, 8(rp)		C zero last limb of output
	adc	%rdx, w2
	mov	w2, (rp)

	dec	vn
	jz	L(ret)

	C Set up for the first addmul_2 pass: one limb of vp consumed.
	lea	-8(up), up
	lea	8(vp), vp

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

C ===========================================================
C     mul_2 for vp[0], vp[1] if vn is even

	ALIGN(16)
L(mul_2):
	mov	R32(un), R32(w0)	C keep row for the mod 4 dispatch

	neg	un
	mov	-8(up,un,8), %rax
	mov	(vp), v0
	mov	8(vp), v1
	mul	v1			C first product of the row in rdx:rax

	and	$-4, un		C round down to multiple of 4
	mov	un, n

	C Dispatch on row mod 4, same scheme as in mul_1.
	and	$3, R32(w0)
	jz	L(mul_2_prologue_0)
	cmp	$2, R32(w0)
	jc	L(mul_2_prologue_1)
	jz	L(mul_2_prologue_2)

L(mul_2_prologue_3):
	mov	%rax, w1
	mov	%rdx, w2
	lea	L(addmul_prologue_3)(%rip), outer_addr
	jmp	L(mul_2_entry_3)

	ALIGN(16)
L(mul_2_prologue_0):
	mov	%rax, w0
	mov	%rdx, w1
	lea	L(addmul_prologue_0)(%rip), outer_addr
	jmp	L(mul_2_entry_0)

	ALIGN(16)
L(mul_2_prologue_1):
	mov	%rax, w3
	mov	%rdx, w0
	mov	$0, R32(w1)
	lea	L(addmul_prologue_1)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_2):
	mov	%rax, w2
	mov	%rdx, w3
	mov	$0, R32(w0)
	mov	16(up,n,8), %rax
	lea	L(addmul_prologue_2)(%rip), outer_addr
	jmp	L(mul_2_entry_2)


	C this loop is 18 c/loop = 2.25 c/l on K8
	C
	C Each up limb is loaded twice, once for the product with v0 and once
	C for the product with v1.  Registers are zeroed with mov $0 rather
	C than xor because several of the zeroings sit inside an add/adc
	C chain and mov leaves the carry flag untouched.

	ALIGN(16)
L(mul_2_top):
	mov	-8(up,n,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
L(mul_2_entry_0):
	mov	$0, R32(w2)
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w0
	mov	(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	mov	w0, (rp,n,8)
	adc	%rdx, w2
L(mul_2_entry_3):
	mov	8(up,n,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	$0, R32(w0)
	adc	$0, R32(w3)
	mov	8(up,n,8), %rax
	mov	w1, 8(rp,n,8)
	mul	v1
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
L(mul_2_entry_2):
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	16(up,n,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, 16(rp,n,8)
	adc	%rdx, w0
L(mul_2_entry_1):
	mov	24(up,n,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, n
	mov	w3, -8(rp,n,8)		C mov keeps ZF from the add for jnz
	jnz	L(mul_2_top)

	mov	w0, (rp)
	mov	w1, 8(rp)

	sub	$2, vn
	jz	L(ret)

	C Set up for the next pass: two limbs of vp consumed.
	lea	16(vp), vp
	lea	-16(up), up

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

C ===========================================================
C     addmul_2 for remaining vp's
C
C Entered through outer_addr with n = un and v0, v1 loaded.  Each prologue
C forms the first product with v1, clears the carry limbs it needs, and
C joins the loop at the entry point for its residue of row mod 4.

	ALIGN(16)
L(addmul_prologue_0):
	mov	-8(up,n,8), %rax
	mul	v1
	mov	%rax, w1
	mov	%rdx, w2
	mov	$0, R32(w3)
	jmp	L(addmul_entry_0)

	ALIGN(16)
L(addmul_prologue_1):
	mov	16(up,n,8), %rax
	mul	v1
	mov	%rax, w0
	mov	%rdx, w1
	mov	$0, R32(w2)
	mov	24(up,n,8), %rax
	jmp	L(addmul_entry_1)

	ALIGN(16)
L(addmul_prologue_2):
	mov	8(up,n,8), %rax
	mul	v1
	mov	%rax, w3
	mov	%rdx, w0
	mov	$0, R32(w1)
	jmp	L(addmul_entry_2)

	ALIGN(16)
L(addmul_prologue_3):
	mov	(up,n,8), %rax
	mul	v1
	mov	%rax, w2
	mov	%rdx, w3
	mov	$0, R32(w0)
	mov	$0, R32(w1)
	jmp	L(addmul_entry_3)

	C this loop is 19 c/loop = 2.375 c/l on K8
	C
	C Same structure as the mul_2 loop, except that each finished limb is
	C added into memory at rp and the carry out of that add feeds the
	C following adc instructions.  The mov $0 zeroings placed between
	C them rely on mov not modifying the flags.

	ALIGN(16)
L(addmul_top):
	mov	$0, R32(w3)
	add	%rax, w0
	mov	-8(up,n,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1
	add	w0, -8(rp,n,8)
	adc	%rax, w1
	adc	%rdx, w2
L(addmul_entry_0):
	mov	(up,n,8), %rax
	mul	v0
	add	%rax, w1
	mov	(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	w1, (rp,n,8)
	mov	$0, R32(w1)
	adc	%rax, w2
	mov	$0, R32(w0)
	adc	%rdx, w3
L(addmul_entry_3):
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w2
	mov	8(up,n,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	w2, 8(rp,n,8)
	adc	%rax, w3
	adc	%rdx, w0
L(addmul_entry_2):
	mov	16(up,n,8), %rax
	mul	v0
	add	%rax, w3
	mov	16(up,n,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	w3, 16(rp,n,8)
	nop			C don't ask...
	adc	%rax, w0
	mov	$0, R32(w2)
	mov	24(up,n,8), %rax
	adc	%rdx, w1
L(addmul_entry_1):
	mul	v0
	add	$4, n
	jnz	L(addmul_top)

	C Fold in the last product and add the three top limbs into the
	C result.
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)

	add	w0, -8(rp)
	adc	w1, (rp)
	adc	w2, 8(rp)

	sub	$2, vn
	jz	L(ret)

	C Set up for the next pass: two more limbs of vp consumed.
	lea	16(vp), vp
	lea	-16(up), up

	mov	un, n
	mov	(vp), v0
	mov	8(vp), v1

	jmp	*outer_addr

C ===========================================================
C     accumulate along diagonals if un - vn is small
C
C w2:w1:w0 is a three limb accumulator.  un is negated and counts up to
C zero, one result limb per outer iteration.  The inner loop does four
C products per iteration, so vn is rounded up to a multiple of 4 and vp is
C biased to compensate, with the loop entered part way through on the
C first trip.  outer_addr records where each later outer iteration
C re-enters.

	ALIGN(16)
L(diagonal):
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	xor	R32(w2), R32(w2)

	neg	un

	C Dispatch on vn mod 4: 0, then 1 (below 2), then 2, else 3.
	mov	R32(vn), %eax
	and	$3, %eax
	jz	L(diag_prologue_0)
	cmp	$2, %eax
	jc	L(diag_prologue_1)
	jz	L(diag_prologue_2)

L(diag_prologue_3):
	lea	-8(vp), vp
	mov	vp, vp_inner
	add	$1, vn
	mov	vn, n
	lea	L(diag_entry_3)(%rip), outer_addr
	jmp	L(diag_entry_3)

	C In the three prologues below, lea 0(%rip) yields the address of the
	C instruction that follows it, so jmp *outer_addr later resumes at
	C the load that primes %rax for the chosen loop entry.
L(diag_prologue_0):
	mov	vp, vp_inner
	mov	vn, n
	lea	0(%rip), outer_addr
	mov	-8(up,n,8), %rax
	jmp	L(diag_entry_0)

L(diag_prologue_1):
	lea	8(vp), vp
	mov	vp, vp_inner
	add	$3, vn
	mov	vn, n
	lea	0(%rip), outer_addr
	mov	-8(vp_inner), %rax
	jmp	L(diag_entry_1)

L(diag_prologue_2):
	lea	-16(vp), vp
	mov	vp, vp_inner
	add	$2, vn
	mov	vn, n
	lea	0(%rip), outer_addr
	mov	16(vp_inner), %rax
	jmp	L(diag_entry_2)


	C this loop is 10 c/loop = 2.5 c/l on K8
	C
	C Each step multiplies one limb of up (descending index via n) by one
	C limb of vp (ascending via vp_inner) and adds the double limb
	C product into w2:w1:w0.

	ALIGN(16)
L(diag_top):
	add	%rax, w0
	adc	%rdx, w1
	mov	-8(up,n,8), %rax
	adc	$0, w2
L(diag_entry_0):
	mulq	(vp_inner)
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, w2
L(diag_entry_3):
	mov	-16(up,n,8), %rax
	mulq	8(vp_inner)
	add	%rax, w0
	mov	16(vp_inner), %rax
	adc	%rdx, w1
	adc	$0, w2
L(diag_entry_2):
	mulq	-24(up,n,8)
	add	%rax, w0
	mov	24(vp_inner), %rax
	adc	%rdx, w1
	lea	32(vp_inner), vp_inner	C lea keeps the carry for the adc
	adc	$0, w2
L(diag_entry_1):
	mulq	-32(up,n,8)
	sub	$4, n
	jnz	L(diag_top)

	C Fold in the last product and store the finished low limb.
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, w2

	mov	w0, (rp,un,8)

	inc	un
	jz	L(diag_end)

	C Next diagonal: reset the inner counters, advance up by one limb
	C and shift the accumulator down by one limb.
	mov	vn, n
	mov	vp, vp_inner

	lea	8(up), up
	mov	w1, w0
	mov	w2, w1
	xor	R32(w2), R32(w2)

	jmp	*outer_addr

L(diag_end):
	C Store the two remaining accumulator limbs.
	mov	w1, (rp)
	mov	w2, 8(rp)

L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()