dnl  AMD64 mpn_gcd_22.  Assumes useless bsf, useless shrd, tzcnt, no shlx.

dnl  Copyright 2019 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/bit
C AMD K8,K9	12.3
C AMD K10	 8.0
C AMD bd1	10.0
C AMD bd2	 7.2
C AMD bd3	 ?
C AMD bd4	 6.7
C AMD bt1	13.6
C AMD bt2	 8.9
C AMD zn1	 5.7
C AMD zn2	 5.6
C Intel P4	 ?
C Intel CNR	 9.7
C Intel PNR	 9.7
C Intel NHM	 9.4
C Intel WSM	 9.5
C Intel SBR	10.3
C Intel IBR	 ?
C Intel HWL	 8.2
C Intel BWL	 7.4
C Intel SKL	 7.3
C Intel atom	26.5
C Intel SLM	17.4
C Intel GLM	13.4
C Intel GLM+	12.4
C VIA nano	 ?


C mpn_gcd_22 -- GCD of two double-limb operands u = (u1,,u0), v = (v1,,v0).
C
C Syntax: AT&T operand order (source, destination), preprocessed by m4.
C ABI:    STD64 only (see ABI_SUPPORT below); the DOS64 variant is disabled.
C
C Register roles:
C   u1,u0     %rdi,%rsi   first operand, high and low limb (incoming)
C   v1        %rdx        second operand, high limb (incoming)
C   v0_param  %rcx        second operand, low limb (incoming)
C   v0        %rax        low limb of v after the entry move; this frees
C                         %rcx for use as the shift count
C   cnt       %rcx        trailing-zero count of the current difference
C   s1,s0     %r9,%r8     copy of u taken before the subtraction
C   t1,t0     %r11,%r10   v - u
C
C Each pass of L(top) forms both v - u and u - v, keeps the non-negative one
C in u, keeps the smaller of the old u and v in v, and shifts u right by the
C number of trailing zero bits of the difference.  When both high limbs are
C zero the work is handed to mpn_gcd_11 by a tail call.
C
C NOTE(review): the shift sequence at L(shr) gives a correct double-limb
C right shift only for counts 1..63.  That holds when u0 and v0 have the same
C parity (presumably both operands are required to be odd) -- confirm against
C the callers and the generic C implementation.

define(`u1',    `%rdi')
define(`u0',    `%rsi')
define(`v1',    `%rdx')
define(`v0_param', `%rcx')

define(`v0',    `%rax')
define(`cnt',   `%rcx')

define(`s0',    `%r8')
define(`s1',    `%r9')
define(`t0',    `%r10')
define(`t1',    `%r11')

dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_gcd_22)
	FUNC_ENTRY(4)
	mov	v0_param, v0		C v0 leaves rcx so rcx can serve as cnt

	ALIGN(16)
L(top):	mov	v0, t0
	sub	u0, t0			C t0 = v0 - u0, borrow feeds the sbb below
	jz	L(lowz)			C	jump when low limb result = 0
	mov	v1, t1
	sbb	u1, t1			C (t1,,t0) = v - u

C t0 is non-zero here (the jz above was not taken), so the count is 0..63.
C rep;bsf is the tzcnt encoding; it writes flags, so it has to come before the
C sub/sbb pair whose carry drives the cmovc group.
	rep;bsf	t0, cnt			C tzcnt!

	mov	u0, s0			C save u; it becomes the new v if u is
	mov	u1, s1			C the smaller operand

	sub	v0, u0
	sbb	v1, u1			C u = u - v, carry set when u is below v

C Entered from L(lowz) as well, with carry set by its own final sub.
L(bck):	cmovc	t0, u0			C u = |u - v|
	cmovc	t1, u1			C u = |u - v|
	cmovc	s0, v0			C v = min(u,v)
	cmovc	s1, v1			C v = min(u,v)

C Rightshift (u1,,u0) into (u1,,u0)
C The shift count is taken from the low byte of cnt; R8() presumably maps the
C register to its 8-bit name.  After neg, the low six bits of cnt hold
C 64 - count, which is what the hardware uses for a 64-bit shift.
L(shr):	shr	R8(cnt), u0
	mov	u1, t1			C keep unshifted u1 for the carried bits
	shr	R8(cnt), u1
	neg	cnt
	shl	R8(cnt), t1		C bits of u1 that move down into u0
	or	t1, u0

	test	v1, v1
	jnz	L(top)			C keep looping while either high limb
	test	u1, u1			C is non-zero
	jnz	L(top)

C Both high limbs are zero: finish with the single-limb routine.  The second
C argument register already holds u0, so only v0 has to be moved.
L(gcd_11):
	mov	v0, %rdi
C	mov	u0, %rsi
	TCALL(	mpn_gcd_11)

L(lowz):C We come here when v0 - u0 = 0
	C 1. If v1 - u1 = 0, then gcd is u = v.
	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
	mov	v1, t0
	sub	u1, t0			C t0 = v1 - u1
	je	L(end)

C The low limb of the difference is zero, so the difference is moved down by a
C whole limb here: the high-limb difference is placed in the low position and
C the high position is cleared in both candidates (t1 and u1).
	xor	t1, t1
	rep;bsf	t0, cnt			C tzcnt!
	mov	u0, s0			C save u as the candidate for new v
	mov	u1, s1
	mov	u1, u0
	xor	u1, u1			C must precede the sub: xor rewrites flags
	sub	v1, u0			C u0 = u1 - v1, carry set when u1 is below v1
	jmp	L(bck)

C u equals v.  v0 and v1 are rax and rdx by definition, so the two moves
C below stay commented out.
L(end):	C mov	v0, %rax
	C mov	v1, %rdx
	FUNC_EXIT()
	ret
EPILOGUE()