Home | History | Annotate | Line # | Download | only in generic
      1 /* mpn_gcdext -- Extended Greatest Common Divisor.
      2 
      3 Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
      4 Inc.
      5 
      6 This file is part of the GNU MP Library.
      7 
      8 The GNU MP Library is free software; you can redistribute it and/or modify
      9 it under the terms of either:
     10 
     11   * the GNU Lesser General Public License as published by the Free
     12     Software Foundation; either version 3 of the License, or (at your
     13     option) any later version.
     14 
     15 or
     16 
     17   * the GNU General Public License as published by the Free Software
     18     Foundation; either version 2 of the License, or (at your option) any
     19     later version.
     20 
     21 or both in parallel, as here.
     22 
     23 The GNU MP Library is distributed in the hope that it will be useful, but
     24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     26 for more details.
     27 
     28 You should have received copies of the GNU General Public License and the
     29 GNU Lesser General Public License along with the GNU MP Library.  If not,
     30 see https://www.gnu.org/licenses/.  */
     31 
     32 #include "gmp-impl.h"
     33 #include "longlong.h"
     34 
     35 /* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
     36    the size is returned (if inputs are non-normalized, result may be
     37    non-normalized too). Temporary space needed is M->n + n.
     38  */
     39 static size_t
     40 hgcd_mul_matrix_vector (struct hgcd_matrix *M,
     41 			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
     42 {
     43   mp_limb_t ah, bh;
     44 
     45   /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
     46 
     47      t  = u00 * a
     48      r  = u10 * b
     49      r += t;
     50 
     51      t  = u11 * b
     52      b  = u01 * a
     53      b += t;
     54   */
     55 
     56   if (M->n >= n)
     57     {
     58       mpn_mul (tp, M->p[0][0], M->n, ap, n);
     59       mpn_mul (rp, M->p[1][0], M->n, bp, n);
     60     }
     61   else
     62     {
     63       mpn_mul (tp, ap, n, M->p[0][0], M->n);
     64       mpn_mul (rp, bp, n, M->p[1][0], M->n);
     65     }
     66 
     67   ah = mpn_add_n (rp, rp, tp, n + M->n);
     68 
     69   if (M->n >= n)
     70     {
     71       mpn_mul (tp, M->p[1][1], M->n, bp, n);
     72       mpn_mul (bp, M->p[0][1], M->n, ap, n);
     73     }
     74   else
     75     {
     76       mpn_mul (tp, bp, n, M->p[1][1], M->n);
     77       mpn_mul (bp, ap, n, M->p[0][1], M->n);
     78     }
     79   bh = mpn_add_n (bp, bp, tp, n + M->n);
     80 
     81   n += M->n;
     82   if ( (ah | bh) > 0)
     83     {
     84       rp[n] = ah;
     85       bp[n] = bh;
     86       n++;
     87     }
     88   else
     89     {
     90       /* Normalize */
     91       while ( (rp[n-1] | bp[n-1]) == 0)
     92 	n--;
     93     }
     94 
     95   return n;
     96 }
     97 
/* Limbs of scratch space (tp) needed by compute_v for inputs of size n. */
#define COMPUTE_V_ITCH(n) (2*(n))

/* Computes |v| = |(g - u a)| / b, where u may be positive or
   negative, and v is of the opposite sign. max(a, b) is of size n, u and
   v at most size n, and v must have space for n+1 limbs.

   usize carries u's sign in its sign and |u|'s limb count in its
   magnitude.  Returns the limb size of |v| (0 if v == 0). */
static mp_size_t
compute_v (mp_ptr vp,
	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
	   mp_srcptr gp, mp_size_t gn,
	   mp_srcptr up, mp_size_t usize,
	   mp_ptr tp)
{
  mp_size_t size;
  mp_size_t an;		/* normalized size of a */
  mp_size_t bn;		/* normalized size of b */
  mp_size_t vn;		/* size of the quotient v */

  ASSERT (n > 0);
  ASSERT (gn > 0);
  ASSERT (usize != 0);	/* u == 0 is the caller's special case */

  size = ABS (usize);
  ASSERT (size <= n);
  ASSERT (up[size-1] > 0);	/* u must be normalized */

  an = n;
  MPN_NORMALIZE (ap, an);
  ASSERT (gn <= an);

  /* t = |u| * a; mpn_mul wants the longer operand first. */
  if (an >= size)
    mpn_mul (tp, ap, an, up, size);
  else
    mpn_mul (tp, up, size, ap, an);

  size += an;		/* t now occupies size limbs (top may be zero) */

  if (usize > 0)
    {
      /* |v| = -v = (u a - g) / b */

      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
      MPN_NORMALIZE (tp, size);
      if (size == 0)
	return 0;
    }
  else
    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
	 (g + |u| a) always fits in (|usize| + an) limbs. */

      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
      size -= (tp[size - 1] == 0);	/* drop a zero high limb, if any */
    }

  /* Now divide t / b. There must be no remainder */
  bn = n;
  MPN_NORMALIZE (bp, bn);
  ASSERT (size >= bn);

  vn = size + 1 - bn;
  ASSERT (vn <= n + 1);

  mpn_divexact (vp, tp, size, bp, bn);
  vn -= (vp[vn-1] == 0);	/* quotient may be one limb shorter */

  return vn;
}
    164 
    165 /* Temporary storage:
    166 
    167    Initial division: Quotient of at most an - n + 1 <= an limbs.
    168 
    169    Storage for u0 and u1: 2(n+1).
    170 
    171    Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
    172 
    173    Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
    174 
    175    When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
    176 
    177    When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
    178 
    179    For the lehmer call after the loop, Let T denote
    180    GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
    181    u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
    182    for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
    183    sufficient for both operations.
    184 
    185 */
    186 
    187 /* Optimal choice of p seems difficult. In each iteration the division
    188  * of work between hgcd and the updates of u0 and u1 depends on the
    189  * current size of the u. It may be desirable to use a different
    190  * choice of p in each iteration. Also the input size seems to matter;
    191  * choosing p = n / 3 in the first iteration seems to improve
    192  * performance slightly for input size just above the threshold, but
    193  * degrade performance for larger inputs. */
/* Split point for the first hgcd call (no u updates yet) and for the
   calls inside the reduction loop, respectively; see the comment
   above on the difficulty of choosing p. */
#define CHOOSE_P_1(n) ((n) / 2)
#define CHOOSE_P_2(n) ((n) / 3)

/* Computes the greatest common divisor g of A = {ap, an} and
   B = {bp, n}, storing it at gp, together with a cofactor u stored at
   up; *usizep gets u's limb count, negated if u < 0, so that
   g = u A + v B for some v of the opposite sign (v is not returned).
   Returns the limb size of g.

   Preconditions (see ASSERTs): an >= n > 0 and B normalized
   (bp[n-1] != 0).  The operands at ap and bp are used as workspace
   and destroyed.  NOTE(review): up appears to need room for n+1
   limbs (cofactor products below are bounded by ualloc = n+1) —
   confirm against the public mpn_gcdext contract. */
mp_size_t
mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
{
  mp_size_t talloc;		/* total temporary limbs to allocate */
  mp_size_t scratch;		/* candidate requirement for each phase */
  mp_size_t matrix_scratch;
  mp_size_t ualloc = n + 1;	/* space reserved for each cofactor */

  struct gcdext_ctx ctx;	/* state shared with mpn_gcdext_hook */
  mp_size_t un;			/* current joint size of u0 and u1 */
  mp_ptr u0;
  mp_ptr u1;

  mp_ptr tp;

  TMP_DECL;

  ASSERT (an >= n);
  ASSERT (n > 0);
  ASSERT (bp[n-1] > 0);

  TMP_MARK;

  /* FIXME: Check for small sizes first, before setting up temporary
     storage etc. */
  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);

  /* For initial division */
  scratch = an - n + 1;
  if (scratch > talloc)
    talloc = scratch;

  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* For hgcd loop.  Size requirements are computed for the worst
	 of the two p choices, since both are used below. */
      mp_size_t hgcd_scratch;
      mp_size_t update_scratch;
      mp_size_t p1 = CHOOSE_P_1 (n);
      mp_size_t p2 = CHOOSE_P_2 (n);
      mp_size_t min_p = MIN(p1, p2);
      mp_size_t max_p = MAX(p1, p2);
      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
      hgcd_scratch = mpn_hgcd_itch (n - min_p);
      update_scratch = max_p + n - 1;

      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
      if (scratch > talloc)
	talloc = scratch;

      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
	 copies of a and b. */
      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
	+ 3*GCDEXT_DC_THRESHOLD;

      if (scratch > talloc)
	talloc = scratch;

      /* Cofactors u0 and u1 */
      talloc += 2*(n+1);
    }

  tp = TMP_ALLOC_LIMBS(talloc);

  if (an > n)
    {
      /* Initial reduction: a <- a mod b (quotient goes to scratch,
	 remainder overwrites the low n limbs of ap). */
      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);

      if (mpn_zero_p (ap, n))
	{
	  /* b divides a: g = b, and g = 0*a + 1*b, so u = 0. */
	  MPN_COPY (gp, bp, n);
	  *usizep = 0;
	  TMP_FREE;
	  return n;
	}
    }

  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      /* Small case: delegate entirely to the Lehmer algorithm. */
      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }

  /* Divide-and-conquer case.  u0 and u1 track the second row of the
     accumulated transformation matrix; both start cleared. */
  MPN_ZERO (tp, 2*ualloc);
  u0 = tp; tp += ualloc;
  u1 = tp; tp += ualloc;

  ctx.gp = gp;
  ctx.up = up;
  ctx.usize = usizep;

  {
    /* For the first hgcd call, there are no u updates, and it makes
       some sense to use a different choice for p. */

    /* FIXME: We could trim use of temporary storage, since u0 and u1
       are not used yet. For the hgcd call, we could swap in the u0
       and u1 pointers for the relevant matrix elements. */

    struct hgcd_matrix M;
    mp_size_t p = CHOOSE_P_1 (n);
    mp_size_t nn;

    mpn_hgcd_matrix_init (&M, n - p, tp);
    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
    if (nn > 0)
      {
	ASSERT (M.n <= (n - p - 1)/2);
	ASSERT (M.n + p <= (p + n - 1) / 2);

	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);

	/* First iteration: the cofactors are simply the second row
	   of M, no multiplication needed. */
	MPN_COPY (u0, M.p[1][0], M.n);
	MPN_COPY (u1, M.p[1][1], M.n);
	un = M.n;
	while ( (u0[un-1] | u1[un-1] ) == 0)
	  un--;
      }
    else
      {
	/* mpn_hgcd has failed. Then either one of a or b is very
	   small, or the difference is very small. Perform one
	   subtraction followed by one division. */
	u1[0] = 1;	/* cofactors start as (u0, u1) = (0, 1) */

	ctx.u0 = u0;
	ctx.u1 = u1;
	ctx.tp = tp + n; /* ualloc */
	ctx.un = 1;

	/* Temporary storage n */
	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	if (n == 0)
	  {
	    /* GCD found by the hook; result already stored. */
	    TMP_FREE;
	    return ctx.gn;
	  }

	un = ctx.un;
	ASSERT (un < ualloc);
      }
  }

  /* Main reduction loop: repeat hgcd steps, accumulating the matrix
     action into (u0, u1), until n falls below the threshold. */
  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
    {
      struct hgcd_matrix M;
      mp_size_t p = CHOOSE_P_2 (n);
      mp_size_t nn;

      mpn_hgcd_matrix_init (&M, n - p, tp);
      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
      if (nn > 0)
	{
	  mp_ptr t0;

	  t0 = tp + matrix_scratch;
	  ASSERT (M.n <= (n - p - 1)/2);
	  ASSERT (M.n + p <= (p + n - 1) / 2);

	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);

	  /* By the same analysis as for mpn_hgcd_matrix_mul */
	  ASSERT (M.n + un <= ualloc);

	  /* FIXME: This copying could be avoided by some swapping of
	   * pointers. May need more temporary storage, though. */
	  MPN_COPY (t0, u0, un);

	  /* Temporary storage ualloc */
	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);

	  ASSERT (un < ualloc);
	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
	}
      else
	{
	  /* mpn_hgcd has failed. Then either one of a or b is very
	     small, or the difference is very small. Perform one
	     subtraction followed by one division. */
	  ctx.u0 = u0;
	  ctx.u1 = u1;
	  ctx.tp = tp + n; /* ualloc */
	  ctx.un = un;

	  /* Temporary storage n */
	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
	  if (n == 0)
	    {
	      TMP_FREE;
	      return ctx.gn;
	    }

	  un = ctx.un;
	  ASSERT (un < ualloc);
	}
    }
  /* We have A = ... a + ... b
	     B =  u0 a +  u1 b

	     a = u1  A + ... B
	     b = -u0 A + ... B

     with bounds

       |u0|, |u1| <= B / min(a, b)

     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
     in which case the only reduction done so far is a = A - k B for
     some k.

     Compute g = u a + v b = (u u1 - v u0) A + (...) B
     Here, u, v are bounded by

       |u| <= b,
       |v| <= a
  */

  ASSERT ( (ap[n-1] | bp[n-1]) > 0);

  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
    {
      /* a == b, so this is the gcd; no final Lehmer step needed. */
      /* Must return the smallest cofactor, +u1 or -u0 */
      int c;

      MPN_COPY (gp, ap, n);

      MPN_CMP (c, u0, u1, un);
      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
	 this case we choose the cofactor + 1, corresponding to G = A
	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
      if (c < 0)
	{
	  /* u0 < u1: return -u0 (negative size encodes the sign). */
	  MPN_NORMALIZE (u0, un);
	  MPN_COPY (up, u0, un);
	  *usizep = -un;
	}
      else
	{
	  MPN_NORMALIZE_NOT_ZERO (u1, un);
	  MPN_COPY (up, u1, un);
	  *usizep = un;
	}

      TMP_FREE;
      return n;
    }
  else if (UNLIKELY (u0[0] == 0) && un == 1)
    {
      mp_size_t gn;
      ASSERT (u1[0] == 1);

      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);

      TMP_FREE;
      return gn;
    }
  else
    {
      /* General case: finish with Lehmer on the reduced (a, b), then
	 compose the small cofactors (u, v) with the accumulated
	 (u0, u1) to get the cofactor of the original A. */
      mp_size_t u0n;
      mp_size_t u1n;
      mp_size_t lehmer_un;
      mp_size_t lehmer_vn;
      mp_size_t gn;

      mp_ptr lehmer_up;
      mp_ptr lehmer_vp;
      int negate;

      lehmer_up = tp; tp += n;

      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
      MPN_COPY (tp, ap, n);
      MPN_COPY (tp + n, bp, n);
      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);

      u0n = un;
      MPN_NORMALIZE (u0, u0n);
      ASSERT (u0n > 0);

      if (lehmer_un == 0)
	{
	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
	  MPN_COPY (up, u0, u0n);
	  *usizep = -u0n;

	  TMP_FREE;
	  return gn;
	}

      lehmer_vp = tp;
      /* Compute v = (g - u a) / b */
      lehmer_vn = compute_v (lehmer_vp,
			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);

      if (lehmer_un > 0)
	negate = 0;
      else
	{
	  /* u < 0; final cofactor u u1 - v u0 is negated too. */
	  lehmer_un = -lehmer_un;
	  negate = 1;
	}

      u1n = un;
      MPN_NORMALIZE (u1, u1n);
      ASSERT (u1n > 0);

      ASSERT (lehmer_un + u1n <= ualloc);
      ASSERT (lehmer_vn + u0n <= ualloc);

      /* We may still have v == 0 */

      /* Compute u u0 */
      if (lehmer_un <= u1n)
	/* Should be the common case */
	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
      else
	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);

      un = u1n + lehmer_un;
      un -= (up[un - 1] == 0);	/* normalize the product */

      if (lehmer_vn > 0)
	{
	  mp_limb_t cy;

	  /* Overwrites old u1 value */
	  if (lehmer_vn <= u0n)
	    /* Should be the common case */
	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
	  else
	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);

	  u1n = u0n + lehmer_vn;
	  u1n -= (u1[u1n - 1] == 0);

	  /* up <- u u1 + v u0; u and v have opposite signs, so the
	     magnitudes add. */
	  if (u1n <= un)
	    {
	      cy = mpn_add (up, up, un, u1, u1n);
	    }
	  else
	    {
	      cy = mpn_add (up, u1, u1n, up, un);
	      un = u1n;
	    }
	  up[un] = cy;
	  un += (cy != 0);

	  ASSERT (un < ualloc);
	}
      *usizep = negate ? -un : un;

      TMP_FREE;
      return gn;
    }
}
    558