1 1.1 mrg /* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder. 2 1.1 mrg 3 1.1.1.3 mrg Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation, 4 1.1.1.3 mrg Inc. 5 1.1 mrg 6 1.1 mrg This file is part of the GNU MP Library. 7 1.1 mrg 8 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify 9 1.1.1.3 mrg it under the terms of either: 10 1.1.1.3 mrg 11 1.1.1.3 mrg * the GNU Lesser General Public License as published by the Free 12 1.1.1.3 mrg Software Foundation; either version 3 of the License, or (at your 13 1.1.1.3 mrg option) any later version. 14 1.1.1.3 mrg 15 1.1.1.3 mrg or 16 1.1.1.3 mrg 17 1.1.1.3 mrg * the GNU General Public License as published by the Free Software 18 1.1.1.3 mrg Foundation; either version 2 of the License, or (at your option) any 19 1.1.1.3 mrg later version. 20 1.1.1.3 mrg 21 1.1.1.3 mrg or both in parallel, as here. 22 1.1 mrg 23 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but 24 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25 1.1.1.3 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26 1.1.1.3 mrg for more details. 27 1.1 mrg 28 1.1.1.3 mrg You should have received copies of the GNU General Public License and the 29 1.1.1.3 mrg GNU Lesser General Public License along with the GNU MP Library. If not, 30 1.1.1.3 mrg see https://www.gnu.org/licenses/. */ 31 1.1 mrg 32 1.1 mrg #include "gmp-impl.h" 33 1.1 mrg #include "longlong.h" 34 1.1 mrg 35 1.1 mrg #include "mpn/sparc64/sparc64.h" 36 1.1 mrg 37 1.1 mrg 38 1.1 mrg /* 64-bit divisor 32-bit divisor 39 1.1 mrg cycles/limb cycles/limb 40 1.1 mrg (approx) (approx) 41 1.1 mrg Ultrasparc 2i: 160 120 42 1.1 mrg */ 43 1.1 mrg 44 1.1 mrg 45 1.1 mrg /* 32-bit divisors are treated in special case code. This requires 4 mulx 46 1.1 mrg per limb instead of 8 in the general case. 47 1.1 mrg 48 1.1 mrg For big endian systems we need HALF_ENDIAN_ADJ included in the src[i] 49 1.1 mrg addressing, to get the two halves of each limb read in the correct order. 50 1.1 mrg This is kept in an adj variable. Doing that measures about 6 c/l faster 51 1.1 mrg than just writing HALF_ENDIAN_ADJ(i) in the loop. The latter shouldn't 52 1.1 mrg be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc 53 1.1 mrg 3.2.1 at least). 54 1.1 mrg 55 1.1 mrg A simple udivx/umulx loop for the 32-bit case was attempted for small 56 1.1 mrg sizes, but at size==2 it was only about the same speed and at size==3 was 57 1.1 mrg slower. */ 58 1.1 mrg 59 1.1.1.2 mrg static mp_limb_t 60 1.1.1.2 mrg mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb) 61 1.1 mrg { 62 1.1 mrg int norm, norm_rshift; 63 1.1 mrg mp_limb_t src_high_limb; 64 1.1 mrg mp_size_t i; 65 1.1 mrg 66 1.1 mrg ASSERT (size_limbs >= 0); 67 1.1 mrg ASSERT (d_limb != 0); 68 1.1 mrg 69 1.1 mrg if (UNLIKELY (size_limbs == 0)) 70 1.1 mrg return 0; 71 1.1 mrg 72 1.1 mrg src_high_limb = src_limbptr[size_limbs-1]; 73 1.1 mrg 74 1.1 mrg /* udivx is good for size==1, and no need to bother checking limb<divisor, 75 1.1 mrg since if that's likely the caller should check */ 76 1.1 mrg if (UNLIKELY (size_limbs == 1)) 77 1.1 mrg return src_high_limb % d_limb; 78 1.1 mrg 79 1.1 mrg if (d_limb <= CNST_LIMB(0xFFFFFFFF)) 80 1.1 mrg { 81 1.1 mrg unsigned *src, n1, n0, r, dummy_q, nshift, norm_rmask; 82 1.1 mrg mp_size_t size, adj; 83 1.1 mrg mp_limb_t dinv_limb; 84 1.1 mrg 85 1.1 mrg size = 2 * size_limbs; /* halfwords */ 86 1.1 mrg src = (unsigned *) src_limbptr; 87 1.1 mrg 88 1.1 mrg /* prospective initial remainder, if < d */ 89 1.1 mrg r = src_high_limb >> 32; 90 1.1 mrg 91 1.1 mrg /* If the length of the source is uniformly distributed, then there's 92 1.1 mrg a 50% chance of the high 32-bits being zero, which we can skip. */ 93 1.1 mrg if (r == 0) 94 1.1 mrg { 95 1.1 mrg r = (unsigned) src_high_limb; 96 1.1 mrg size--; 97 1.1 mrg ASSERT (size > 0); /* because always even */ 98 1.1 mrg } 99 1.1 mrg 100 1.1 mrg /* Skip a division if high < divisor. Having the test here before 101 1.1 mrg normalizing will still skip as often as possible. */ 102 1.1 mrg if (r < d_limb) 103 1.1 mrg { 104 1.1 mrg size--; 105 1.1 mrg ASSERT (size > 0); /* because size==1 handled above */ 106 1.1 mrg } 107 1.1 mrg else 108 1.1 mrg r = 0; 109 1.1 mrg 110 1.1 mrg count_leading_zeros_32 (norm, d_limb); 111 1.1 mrg norm -= 32; 112 1.1 mrg d_limb <<= norm; 113 1.1 mrg 114 1.1 mrg norm_rshift = 32 - norm; 115 1.1 mrg norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF); 116 1.1 mrg i = size-1; 117 1.1 mrg adj = HALF_ENDIAN_ADJ (i); 118 1.1 mrg n1 = src [i + adj]; 119 1.1 mrg r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask); 120 1.1 mrg 121 1.1 mrg invert_half_limb (dinv_limb, d_limb); 122 1.1 mrg adj = -adj; 123 1.1 mrg 124 1.1 mrg for (i--; i >= 0; i--) 125 1.1 mrg { 126 1.1 mrg n0 = src [i + adj]; 127 1.1 mrg adj = -adj; 128 1.1 mrg nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); 129 1.1 mrg udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb); 130 1.1 mrg n1 = n0; 131 1.1 mrg } 132 1.1 mrg 133 1.1 mrg /* same as loop, but without n0 */ 134 1.1 mrg nshift = n1 << norm; 135 1.1 mrg udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb); 136 1.1 mrg 137 1.1 mrg ASSERT ((r & ((1 << norm) - 1)) == 0); 138 1.1 mrg return r >> norm; 139 1.1 mrg } 140 1.1 mrg else 141 1.1 mrg { 142 1.1 mrg mp_srcptr src; 143 1.1 mrg mp_size_t size; 144 1.1 mrg mp_limb_t n1, n0, r, dinv, dummy_q, nshift, norm_rmask; 145 1.1 mrg 146 1.1 mrg src = src_limbptr; 147 1.1 mrg size = size_limbs; 148 1.1 mrg r = src_high_limb; /* initial remainder */ 149 1.1 mrg 150 1.1 mrg /* Skip a division if high < divisor. Having the test here before 151 1.1 mrg normalizing will still skip as often as possible. */ 152 1.1 mrg if (r < d_limb) 153 1.1 mrg { 154 1.1 mrg size--; 155 1.1 mrg ASSERT (size > 0); /* because size==1 handled above */ 156 1.1 mrg } 157 1.1 mrg else 158 1.1 mrg r = 0; 159 1.1 mrg 160 1.1 mrg count_leading_zeros (norm, d_limb); 161 1.1 mrg d_limb <<= norm; 162 1.1 mrg 163 1.1 mrg norm_rshift = GMP_LIMB_BITS - norm; 164 1.1 mrg norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF); 165 1.1 mrg 166 1.1 mrg src += size; 167 1.1 mrg n1 = *--src; 168 1.1 mrg r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask); 169 1.1 mrg 170 1.1 mrg invert_limb (dinv, d_limb); 171 1.1 mrg 172 1.1 mrg for (i = size-2; i >= 0; i--) 173 1.1 mrg { 174 1.1 mrg n0 = *--src; 175 1.1 mrg nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask); 176 1.1 mrg udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv); 177 1.1 mrg n1 = n0; 178 1.1 mrg } 179 1.1 mrg 180 1.1 mrg /* same as loop, but without n0 */ 181 1.1 mrg nshift = n1 << norm; 182 1.1 mrg udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv); 183 1.1 mrg 184 1.1 mrg ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0); 185 1.1 mrg return r >> norm; 186 1.1 mrg } 187 1.1 mrg } 188 1.1.1.2 mrg 189 1.1.1.2 mrg mp_limb_t 190 1.1.1.2 mrg mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b) 191 1.1.1.2 mrg { 192 1.1.1.2 mrg ASSERT (n >= 0); 193 1.1.1.2 mrg ASSERT (b != 0); 194 1.1.1.2 mrg 195 1.1.1.2 mrg /* Should this be handled at all? Rely on callers? Note un==0 is currently 196 1.1.1.2 mrg required by mpz/fdiv_r_ui.c and possibly other places. */ 197 1.1.1.2 mrg if (n == 0) 198 1.1.1.2 mrg return 0; 199 1.1.1.2 mrg 200 1.1.1.2 mrg if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0)) 201 1.1.1.2 mrg { 202 1.1.1.2 mrg if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD)) 203 1.1.1.2 mrg { 204 1.1.1.2 mrg return mpn_mod_1_anynorm (ap, n, b); 205 1.1.1.2 mrg } 206 1.1.1.2 mrg else 207 1.1.1.2 mrg { 208 1.1.1.2 mrg mp_limb_t pre[4]; 209 1.1.1.2 mrg mpn_mod_1_1p_cps (pre, b); 210 1.1.1.2 mrg return mpn_mod_1_1p (ap, n, b, pre); 211 1.1.1.2 mrg } 212 1.1.1.2 mrg } 213 1.1.1.2 mrg else 214 1.1.1.2 mrg { 215 1.1.1.2 mrg if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD)) 216 1.1.1.2 mrg { 217 1.1.1.2 mrg return mpn_mod_1_anynorm (ap, n, b); 218 1.1.1.2 mrg } 219 1.1.1.2 mrg else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD)) 220 1.1.1.2 mrg { 221 1.1.1.2 mrg mp_limb_t pre[4]; 222 1.1.1.2 mrg mpn_mod_1_1p_cps (pre, b); 223 1.1.1.2 mrg return mpn_mod_1_1p (ap, n, b << pre[1], pre); 224 1.1.1.2 mrg } 225 1.1.1.2 mrg else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4)) 226 1.1.1.2 mrg { 227 1.1.1.2 mrg mp_limb_t pre[5]; 228 1.1.1.2 mrg mpn_mod_1s_2p_cps (pre, b); 229 1.1.1.2 mrg return mpn_mod_1s_2p (ap, n, b << pre[1], pre); 230 1.1.1.2 mrg } 231 1.1.1.2 mrg else 232 1.1.1.2 mrg { 233 1.1.1.2 mrg mp_limb_t pre[7]; 234 1.1.1.2 mrg mpn_mod_1s_4p_cps (pre, b); 235 1.1.1.2 mrg return mpn_mod_1s_4p (ap, n, b << pre[1], pre); 236 1.1.1.2 mrg } 237 1.1.1.2 mrg } 238 1.1.1.2 mrg } 239