1 1.1 mrg dnl Intel P6 mpn_lshsub_n -- mpn papillion support. 2 1.1 mrg 3 1.1 mrg dnl Copyright 2006 Free Software Foundation, Inc. 4 1.1.1.2 mrg 5 1.1 mrg dnl This file is part of the GNU MP Library. 6 1.1 mrg dnl 7 1.1 mrg dnl The GNU MP Library is free software; you can redistribute it and/or modify 8 1.1.1.2 mrg dnl it under the terms of either: 9 1.1.1.2 mrg dnl 10 1.1.1.2 mrg dnl * the GNU Lesser General Public License as published by the Free 11 1.1.1.2 mrg dnl Software Foundation; either version 3 of the License, or (at your 12 1.1.1.2 mrg dnl option) any later version. 13 1.1.1.2 mrg dnl 14 1.1.1.2 mrg dnl or 15 1.1.1.2 mrg dnl 16 1.1.1.2 mrg dnl * the GNU General Public License as published by the Free Software 17 1.1.1.2 mrg dnl Foundation; either version 2 of the License, or (at your option) any 18 1.1.1.2 mrg dnl later version. 19 1.1.1.2 mrg dnl 20 1.1.1.2 mrg dnl or both in parallel, as here. 21 1.1 mrg dnl 22 1.1 mrg dnl The GNU MP Library is distributed in the hope that it will be useful, but 23 1.1 mrg dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24 1.1.1.2 mrg dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25 1.1.1.2 mrg dnl for more details. 26 1.1 mrg dnl 27 1.1.1.2 mrg dnl You should have received copies of the GNU General Public License and the 28 1.1.1.2 mrg dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29 1.1.1.2 mrg dnl see https://www.gnu.org/licenses/. 30 1.1 mrg 31 1.1 mrg include(`../config.m4') 32 1.1 mrg 33 1.1 mrg C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12) 34 1.1 mrg 35 1.1.1.2 mrg C (1) The loop is not scheduled in any way, and scheduling attempts have not 36 1.1 mrg C improved speed on P6/13. Presumably, the K7 will want scheduling, if it 37 1.1 mrg C at all wants to use MMX. 38 1.1 mrg C (2) We could save a register by not alternatingly using eax and edx in the 39 1.1 mrg C loop. 
define(`rp',	`%edi')
define(`up',	`%esi')
define(`vp',	`%ebx')
define(`n',	`%ecx')
define(`cnt',	`%mm7')

ASM_START()

	TEXT
	ALIGN(16)

C mpn_lshsub_n
C
C Subtracts the vp operand from the up operand limb by limb with borrow
C propagation, shifts the running difference left by the shift count and
C stores the low 32 bits of each shifted limb at rp.
C
C Stack arguments at entry, 32-bit limbs:
C	 4(%esp)	rp	destination pointer
C	 8(%esp)	up	minuend pointer
C	12(%esp)	vp	subtrahend pointer
C	16(%esp)	n	limb count
C	20(%esp)	shift count
C
C Return value in eax:
C	(final borrow << count) | (most significant difference limb >> (32-count))
C
C Registers: edi, esi and ebx are saved and restored.  eax, ecx, edx,
C mm0, mm1 and mm7 are overwritten.  emms is executed before returning.
C
C NOTE(review): there is no special handling of n = 0 and the shift is done
C with a 64-bit psrlq by 32-count, so presumably n >= 1 and 1 <= count <= 31
C are required -- confirm against the callers.
C
C Implementation notes:
C
C The loop is unrolled 8 ways and is entered through a computed jump to
C L(ent) + 22*x, where x = (-n) mod 8 is the number of limb groups to skip in
C the first pass.  This only works if every limb group is exactly 22 bytes,
C which in turn requires every memory operand in the loop to be encoded with
C a one byte displacement.  GNU as encodes a zero displacement without any
C displacement byte, which would make a group using offset 0 three bytes
C shorter and misplace all entry points except the first.  To avoid depending
C on how the assembler treats a zero displacement, the three pointers are
C biased by -4 and the loop uses the displacements 4 to 32, none of which is
C zero and all of which fit in one byte.  The addresses accessed are the same
C as with unbiased pointers and displacements 0 to 28.

PROLOGUE(mpn_lshsub_n)
	push	%edi
	push	%esi
	push	%ebx

	C Three pushes plus the return address put the first argument at 16(%esp).
	mov	16(%esp), rp
	mov	20(%esp), up
	mov	24(%esp), vp
	mov	28(%esp), n
	mov	$32, %eax
	sub	32(%esp), %eax
	movd	%eax, cnt			C mm7 = 32 - shift count

	C Point one limb below the end of each operand, see the notes above.
	C The index register n then runs from a negative value up to zero.
	lea	-4(up,n,4), up
	lea	-4(vp,n,4), vp
	lea	-4(rp,n,4), rp

	neg	n
	mov	n, %eax
	and	$-8, n				C index rounded down to a multiple of 8
	and	$7, %eax			C x = limb groups to skip at entry
	shl	%eax				C eax = 2x, CF clear since x <= 7
	lea	(%eax,%eax,4), %edx		C edx = 10x
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	lea	L(ent)(%eax,%edx,2), %eax	C eax = 22x
')

	C Both MMX accumulators start at zero so that the first limb stored gets
	C zero bits shifted in from below.  pxor and jmp leave CF untouched.
	pxor	%mm1, %mm1
	pxor	%mm0, %mm0

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	lea	(%eax,%edx,2), %eax
	add	$L(ent)-L(here), %eax
	add	(%esp), %eax
	ret_internal
')

L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
	sbb	%eax, %eax			C eax = -borrow
	neg	%eax				C eax = borrow
	mov	32(%esp), %ecx			C reload the shift count
	shld	%cl, %edx, %eax			C edx holds the last difference limb

	emms

	pop	%ebx
	pop	%esi
	pop	%edi
	ret

	C Main loop.  Each limb group below must assemble to exactly 22 bytes.
	C The groups alternate between eax/mm0 and edx/mm1 for the new difference
	C limb.  punpckldq places the new limb above the previous one, psrlq by
	C 32-count then leaves the shifted result limb in the low 32 bits.
	C jecxz, lea and jmp do not modify the flags, so the borrow in CF is
	C carried from one sbb to the next, also across loop iterations.
	ALIGN(16)
L(top):	jecxz	L(end)
L(ent):	mov	4(up,n,4), %eax
	sbb	4(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 4(rp,n,4)

	mov	8(up,n,4), %edx
	sbb	8(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 8(rp,n,4)

	mov	12(up,n,4), %eax
	sbb	12(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 12(rp,n,4)

	mov	16(up,n,4), %edx
	sbb	16(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 16(rp,n,4)

	mov	20(up,n,4), %eax
	sbb	20(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 20(rp,n,4)

	mov	24(up,n,4), %edx
	sbb	24(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 24(rp,n,4)

	mov	28(up,n,4), %eax
	sbb	28(vp,n,4), %eax
	movd	%eax, %mm0
	punpckldq %mm0, %mm1
	psrlq	%mm7, %mm1
	movd	%mm1, 28(rp,n,4)

	mov	32(up,n,4), %edx
	sbb	32(vp,n,4), %edx
	movd	%edx, %mm1
	punpckldq %mm1, %mm0
	psrlq	%mm7, %mm0
	movd	%mm0, 32(rp,n,4)

	lea	8(n), n				C advance 8 limbs, flags preserved
	jmp	L(top)

EPILOGUE()