1 1.1 mrg /* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division. 2 1.1 mrg 3 1.1 mrg THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST 4 1.1 mrg CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN 5 1.1 mrg FUTURE GNU MP RELEASES. 6 1.1 mrg 7 1.1.1.3 mrg Copyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc. 8 1.1 mrg 9 1.1 mrg This file is part of the GNU MP Library. 10 1.1 mrg 11 1.1 mrg The GNU MP Library is free software; you can redistribute it and/or modify 12 1.1.1.2 mrg it under the terms of either: 13 1.1.1.2 mrg 14 1.1.1.2 mrg * the GNU Lesser General Public License as published by the Free 15 1.1.1.2 mrg Software Foundation; either version 3 of the License, or (at your 16 1.1.1.2 mrg option) any later version. 17 1.1.1.2 mrg 18 1.1.1.2 mrg or 19 1.1.1.2 mrg 20 1.1.1.2 mrg * the GNU General Public License as published by the Free Software 21 1.1.1.2 mrg Foundation; either version 2 of the License, or (at your option) any 22 1.1.1.2 mrg later version. 23 1.1.1.2 mrg 24 1.1.1.2 mrg or both in parallel, as here. 25 1.1 mrg 26 1.1 mrg The GNU MP Library is distributed in the hope that it will be useful, but 27 1.1 mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 28 1.1.1.2 mrg or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 29 1.1.1.2 mrg for more details. 30 1.1 mrg 31 1.1.1.2 mrg You should have received copies of the GNU General Public License and the 32 1.1.1.2 mrg GNU Lesser General Public License along with the GNU MP Library. If not, 33 1.1.1.2 mrg see https://www.gnu.org/licenses/. */ 34 1.1 mrg 35 1.1 mrg #include "gmp-impl.h" 36 1.1 mrg #include "longlong.h" 37 1.1 mrg 38 1.1 mrg #include "mpn/sparc64/sparc64.h" 39 1.1 mrg 40 1.1 mrg 41 1.1 mrg /* 64-bit divisor 32-bit divisor 42 1.1 mrg cycles/limb cycles/limb 43 1.1 mrg (approx) (approx) 44 1.1 mrg Ultrasparc 2i: 110 70 45 1.1 mrg */ 46 1.1 mrg 47 1.1 mrg 48 1.1 mrg /* There are two key ideas here to reduce mulx's. Firstly when the divisor 49 1.1 mrg is 32-bits the high of q*d can be calculated without the two 32x32->64 50 1.1 mrg cross-products involving the high 32-bits of the divisor, that being zero 51 1.1 mrg of course. Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save 52 1.1 mrg one mulx (each) knowing the low of q*d is equal to the input limb l. 53 1.1 mrg 54 1.1 mrg For size==1, a simple udivx is used. This is faster than calculating an 55 1.1 mrg inverse. 56 1.1 mrg 57 1.1 mrg For a 32-bit divisor and small sizes, an attempt was made at a simple 58 1.1 mrg udivx loop (two per 64-bit limb), but it turned out to be slower than 59 1.1 mrg mul-by-inverse. At size==2 the inverse is about 260 cycles total 60 1.1 mrg compared to a udivx at 291. Perhaps the latter would suit when size==2 61 1.1 mrg but the high 32-bits of the second limb is zero (saving one udivx), but 62 1.1 mrg it doesn't seem worth a special case just for that. */ 63 1.1 mrg 64 1.1 mrg void 65 1.1 mrg mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor) 66 1.1 mrg { 67 1.1 mrg mp_limb_t inverse, s, s_next, c, l, ls, q; 68 1.1 mrg unsigned rshift, lshift; 69 1.1 mrg mp_limb_t lshift_mask; 70 1.1 mrg mp_limb_t divisor_h; 71 1.1 mrg 72 1.1 mrg ASSERT (size >= 1); 73 1.1 mrg ASSERT (divisor != 0); 74 1.1 mrg ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); 75 1.1 mrg ASSERT_MPN (src, size); 76 1.1 mrg ASSERT_LIMB (divisor); 77 1.1 mrg 78 1.1 mrg s = *src++; /* src low limb */ 79 1.1 mrg size--; 80 1.1 mrg if (size == 0) 81 1.1 mrg { 82 1.1 mrg *dst = s / divisor; 83 1.1 mrg return; 84 1.1 mrg } 85 1.1 mrg 86 1.1 mrg if ((divisor & 1) == 0) 87 1.1 mrg { 88 1.1 mrg count_trailing_zeros (rshift, divisor); 89 1.1 mrg divisor >>= rshift; 90 1.1.1.3 mrg lshift = 64 - rshift; 91 1.1.1.3 mrg 92 1.1.1.3 mrg lshift_mask = MP_LIMB_T_MAX; 93 1.1 mrg } 94 1.1 mrg else 95 1.1.1.3 mrg { 96 1.1.1.3 mrg rshift = 0; 97 1.1 mrg 98 1.1.1.3 mrg /* rshift==0 means no shift, so must mask out other part in this case */ 99 1.1.1.3 mrg lshift = 0; 100 1.1.1.3 mrg lshift_mask = 0; 101 1.1.1.3 mrg } 102 1.1 mrg 103 1.1.1.3 mrg binvert_limb (inverse, divisor); 104 1.1 mrg 105 1.1 mrg c = 0; 106 1.1 mrg divisor_h = HIGH32 (divisor); 107 1.1 mrg 108 1.1 mrg if (divisor_h == 0) 109 1.1 mrg { 110 1.1 mrg /* 32-bit divisor */ 111 1.1 mrg do 112 1.1 mrg { 113 1.1 mrg s_next = *src++; 114 1.1 mrg ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); 115 1.1 mrg s = s_next; 116 1.1 mrg 117 1.1 mrg SUBC_LIMB (c, l, ls, c); 118 1.1 mrg 119 1.1 mrg q = l * inverse; 120 1.1 mrg *dst++ = q; 121 1.1 mrg 122 1.1 mrg umul_ppmm_half_lowequal (l, q, divisor, l); 123 1.1 mrg c += l; 124 1.1 mrg 125 1.1 mrg size--; 126 1.1 mrg } 127 1.1 mrg while (size != 0); 128 1.1 mrg 129 1.1 mrg ls = s >> rshift; 130 1.1 mrg l = ls - c; 131 1.1 mrg q = l * inverse; 132 1.1 mrg *dst = q; 133 1.1 mrg } 134 1.1 mrg else 135 1.1 mrg { 136 1.1 mrg /* 64-bit divisor */ 137 1.1 mrg mp_limb_t divisor_l = LOW32 (divisor); 138 1.1 mrg do 139 1.1 mrg { 140 1.1 mrg s_next = *src++; 141 1.1 mrg ls = (s >> rshift) | ((s_next << lshift) & lshift_mask); 142 1.1 mrg s = s_next; 143 1.1 mrg 144 1.1 mrg SUBC_LIMB (c, l, ls, c); 145 1.1 mrg 146 1.1 mrg q = l * inverse; 147 1.1 mrg *dst++ = q; 148 1.1 mrg 149 1.1 mrg umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l); 150 1.1 mrg c += l; 151 1.1 mrg 152 1.1 mrg size--; 153 1.1 mrg } 154 1.1 mrg while (size != 0); 155 1.1 mrg 156 1.1 mrg ls = s >> rshift; 157 1.1 mrg l = ls - c; 158 1.1 mrg q = l * inverse; 159 1.1 mrg *dst = q; 160 1.1 mrg } 161 1.1 mrg } 162