dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
dnl  store difference in a third limb vector.

dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     4
C UltraSPARC 3:	      4.5

C Compute carry-out from the most significant bits of u,v, and r, where
C r=u-v-carry_in, using logic operations.
C
C With r = u - v - cy, the borrow out of a limb is the top bit of
C	(r | (~u & v)) & ~(u & ~v)
C which the code forms with the sequence
C	orn  u,v,g2	g2 = u | ~v
C	orn  r,g2,g2	g2 = r | (~u & v)
C	andn u,v,g3	g3 = u & ~v
C	andn g2,g3,g2	g2 = g2 & ~g3
C	srlx g2,63,cy	cy = top bit, i.e. 0 or 1

C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
C Therefore, it seems futile to try to optimize this any further...

C INPUT PARAMETERS
C   rp  (%i0)  destination limb vector
C   up  (%i1)  minuend limb vector
C   vp  (%i2)  subtrahend limb vector
C   n   (%i3)  number of limbs, must be > 0
C   cy  (%i4)  borrow in; read from the caller only by mpn_sub_nc,
C              mpn_sub_n forces it to 0
C
C RESULT
C   The final borrow is copied to %i0 before the restore, so the caller
C   sees it in %o0.
C
C SCRATCH
C   %g1 (raw difference), %g2/%g3 (borrow logic), %g5 (result limb),
C   %l0-%l7 (four u/v limb pairs), %f0/%f2/%f4 (pipeline filler ops).
define(`rp',`%i0')
define(`up',`%i1')
define(`vp',`%i2')
define(`n',`%i3')

define(`u0',`%l0')
define(`u1',`%l2')
define(`u2',`%l4')
define(`u3',`%l6')
define(`v0',`%l1')
define(`v1',`%l3')
define(`v2',`%l5')
define(`v3',`%l7')

define(`cy',`%i4')

define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C mpn_sub_nc: same as mpn_sub_n but keeps the borrow-in supplied in cy.
C It duplicates the mpn_sub_n entry sequence and then joins the shared
C body at L(com), or at .Loop0 when n < 4.
PROLOGUE(mpn_sub_nc)
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0
	bl,pn	%xcc,.Loop0	C n < 4: limb-at-a-time loop only
	nop			C delay slot; cy must stay as passed in
	b,a	L(com)		C annulled branch, no delay slot executed
EPILOGUE()

PROLOGUE(mpn_sub_n)
	save	%sp,-160,%sp

	fitod	%f0,%f0		C make sure f0 contains small, quiet number
	subcc	n,4,%g0
	bl,pn	%xcc,.Loop0	C n < 4: limb-at-a-time loop only
	mov	0,cy		C delay slot, runs on both paths: no borrow in
L(com):
C Preload four limb pairs and start the first subtraction.  n is biased by
C -8 here: 4 for the limbs just loaded plus 4 for the next loop round, so
C n >= 0 means another full group of four can be loaded.
	ldx	[up+0],u0
	ldx	[vp+0],v0
	add	up,32,up
	ldx	[up-24],u1
	ldx	[vp+8],v1
	add	vp,32,vp
	ldx	[up-16],u2
	ldx	[vp-16],v2
	ldx	[up-8],u3
	ldx	[vp-8],v3
	subcc	n,8,n
	sub	u0,v0,%g1	C main sub
	sub	%g1,cy,%g5	C carry sub
	orn	u0,v0,%g2
	bl,pn	%xcc,.Lend4567	C fewer than 8 limbs: skip the main loop
	fanop
	b,a	.Loop

	.align	16
C START MAIN LOOP
C Four limbs per iteration.  Each C -- marks the end of one group of four
C instructions.  Loads for the next iteration are interleaved with the
C borrow computation and store of the current limbs.
.Loop:	orn	%g5,%g2,%g2
	andn	u0,v0,%g3
	ldx	[up+0],u0
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp+0],v0
	add	up,32,up
	fanop
C --
	srlx	%g2,63,cy
	sub	u1,v1,%g1
	stx	%g5,[rp+0]
	fanop
C --
	sub	%g1,cy,%g5
	orn	u1,v1,%g2
	fmnop
	fanop
C --
	orn	%g5,%g2,%g2
	andn	u1,v1,%g3
	ldx	[up-24],u1
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp+8],v1
	add	vp,32,vp
	fanop
C --
	srlx	%g2,63,cy
	sub	u2,v2,%g1
	stx	%g5,[rp+8]
	fanop
C --
	sub	%g1,cy,%g5
	orn	u2,v2,%g2
	fmnop
	fanop
C --
	orn	%g5,%g2,%g2
	andn	u2,v2,%g3
	ldx	[up-16],u2
	fanop
C --
	andn	%g2,%g3,%g2
	ldx	[vp-16],v2
	add	rp,32,rp
	fanop
C --
	srlx	%g2,63,cy
	sub	u3,v3,%g1
	stx	%g5,[rp-16]
	fanop
C --
	sub	%g1,cy,%g5
	orn	u3,v3,%g2
	fmnop
	fanop
C --
	orn	%g5,%g2,%g2
	andn	u3,v3,%g3
	ldx	[up-8],u3
	fanop
C --
	andn	%g2,%g3,%g2
	subcc	n,4,n
	ldx	[vp-8],v3
	fanop
C --
	srlx	%g2,63,cy
	sub	u0,v0,%g1
	stx	%g5,[rp-8]
	fanop
C --
	sub	%g1,cy,%g5
	orn	u0,v0,%g2
	bge,pt	%xcc,.Loop
	fanop
C END MAIN LOOP

C Wind down: finish the four limb pairs that are already in registers.
C On entry %g5 holds the first result limb and %g2 holds u0 | ~v0.
.Lend4567:
	orn	%g5,%g2,%g2
	andn	u0,v0,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	sub	u1,v1,%g1
	stx	%g5,[rp+0]
	sub	%g1,cy,%g5
	orn	u1,v1,%g2
	orn	%g5,%g2,%g2
	andn	u1,v1,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	sub	u2,v2,%g1
	stx	%g5,[rp+8]
	sub	%g1,cy,%g5
	orn	u2,v2,%g2
	orn	%g5,%g2,%g2
	andn	u2,v2,%g3
	andn	%g2,%g3,%g2
	add	rp,32,rp
	srlx	%g2,63,cy
	sub	u3,v3,%g1
	stx	%g5,[rp-16]
	sub	%g1,cy,%g5
	orn	u3,v3,%g2
	orn	%g5,%g2,%g2
	andn	u3,v3,%g3
	andn	%g2,%g3,%g2
	srlx	%g2,63,cy
	stx	%g5,[rp-8]

	addcc	n,4,n		C undo the bias: n = limbs still to do (0..3)
	bz,pn	%xcc,.Lret
	fanop

C One limb per iteration.  Used for the 0..3 leftover limbs and for the
C whole operation when n < 4.  Requires n > 0 on entry.
.Loop0:	ldx	[up],u0
	add	up,8,up
	ldx	[vp],v0
	add	vp,8,vp
	add	rp,8,rp
	subcc	n,1,n
	sub	u0,v0,%g1
	orn	u0,v0,%g2
	sub	%g1,cy,%g5
	andn	u0,v0,%g3
	orn	%g5,%g2,%g2
	stx	%g5,[rp-8]
	andn	%g2,%g3,%g2
	bnz,pt	%xcc,.Loop0
	srlx	%g2,63,cy	C delay slot: borrow out of this limb

.Lret:	mov	cy,%i0		C return value: final borrow
	ret
	restore
EPILOGUE(mpn_sub_n)