gmp/dist/gen-psqr.c

    1.1  mrg /* Generate perfect square testing data.
    1.1  mrg
1.1.1.3  mrg Copyright 2002-2004, 2012, 2014 Free Software Foundation, Inc.
    1.1  mrg
    1.1  mrg This file is part of the GNU MP Library.
    1.1  mrg
    1.1  mrg The GNU MP Library is free software; you can redistribute it and/or modify
1.1.1.3  mrg it under the terms of either:
1.1.1.3  mrg
1.1.1.3  mrg   * the GNU Lesser General Public License as published by the Free
1.1.1.3  mrg     Software Foundation; either version 3 of the License, or (at your
1.1.1.3  mrg     option) any later version.
1.1.1.3  mrg
1.1.1.3  mrg or
1.1.1.3  mrg
1.1.1.3  mrg   * the GNU General Public License as published by the Free Software
1.1.1.3  mrg     Foundation; either version 2 of the License, or (at your option) any
1.1.1.3  mrg     later version.
1.1.1.3  mrg
1.1.1.3  mrg or both in parallel, as here.
    1.1  mrg
    1.1  mrg The GNU MP Library is distributed in the hope that it will be useful, but
    1.1  mrg WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
1.1.1.3  mrg or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
1.1.1.3  mrg for more details.
    1.1  mrg
1.1.1.3  mrg You should have received copies of the GNU General Public License and the
1.1.1.3  mrg GNU Lesser General Public License along with the GNU MP Library.  If not,
1.1.1.3  mrg see https://www.gnu.org/licenses/.  */
    1.1  mrg
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include "bootstrap.c"
    1.1  mrg
    1.1  mrg
    1.1  mrg /* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
    1.1  mrg    (plus a PERFSQR_PP modulus), and generate tables indicating quadratic
    1.1  mrg    residues and non-residues modulo small factors of that modulus.
    1.1  mrg
    1.1  mrg    For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used.  That
    1.1  mrg    function exists specifically because 2^24-1 and 2^48-1 have nice sets of
    1.1  mrg    prime factors.  For other limb sizes it's considered, but if it doesn't
    1.1  mrg    have good factors then mpn_mod_1 will be used instead.
    1.1  mrg
    1.1  mrg    When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
    1.1  mrg    selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
    1.1  mrg    with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
    1.1  mrg    GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
    1.1  mrg    calculation within a single limb.
    1.1  mrg
    1.1  mrg    In either case primes can be combined to make divisors.  The table data
    1.1  mrg    then effectively indicates remainders which are quadratic residues mod
    1.1  mrg    all the primes.  This sort of combining reduces the number of steps
    1.1  mrg    needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
    1.1  mrg    Nothing is gained or lost in terms of detections, the same total fraction
    1.1  mrg    of non-residues will be identified.
    1.1  mrg
    1.1  mrg    Nothing particularly sophisticated is attempted for combining factors to
    1.1  mrg    make divisors.  This is probably a kind of knapsack problem so it'd be
    1.1  mrg    too hard to attempt anything completely general.  For the usual 32 and 64
    1.1  mrg    bit limbs we get a good enough result just pairing the biggest and
    1.1  mrg    smallest which fit together, repeatedly.
    1.1  mrg
    1.1  mrg    Another aim is to get powerful combinations, ie. divisors which identify
    1.1  mrg    biggest fraction of non-residues, and have those run first.  Again for
    1.1  mrg    the usual 32 and 64 bits it seems good enough just to pair for big
    1.1  mrg    divisors then sort according to the resulting fraction of non-residues
    1.1  mrg    identified.
    1.1  mrg
    1.1  mrg    Also in this program, a table sq_res_0x100 of residues modulo 256 is
    1.1  mrg    generated.  This simply fills bits into limbs of the appropriate
    1.1  mrg    build-time GMP_LIMB_BITS each.
    1.1  mrg
    1.1  mrg */
    1.1  mrg
    1.1  mrg
    1.1  mrg /* Normally we aren't using const in gen*.c programs, so as not to have to
    1.1  mrg    bother figuring out if it works, but using it with f_cmp_divisor and
    1.1  mrg    f_cmp_fraction avoids warnings from the qsort calls. */
    1.1  mrg
    1.1  mrg /* Same tests as gmp.h. */
    1.1  mrg #if  defined (__STDC__)                                 \
    1.1  mrg   || defined (__cplusplus)                              \
    1.1  mrg   || defined (_AIX)                                     \
    1.1  mrg   || defined (__DECC)                                   \
    1.1  mrg   || (defined (__mips) && defined (_SYSTYPE_SVR4))      \
    1.1  mrg   || defined (_MSC_VER)                                 \
    1.1  mrg   || defined (_WIN32)
    1.1  mrg #define HAVE_CONST        1
    1.1  mrg #endif
    1.1  mrg
    1.1  mrg #if ! HAVE_CONST
    1.1  mrg #define const
    1.1  mrg #endif
    1.1  mrg
    1.1  mrg
    1.1  mrg mpz_t  *sq_res_0x100;          /* table of limbs */
    1.1  mrg int    nsq_res_0x100;          /* elements in sq_res_0x100 array */
    1.1  mrg int    sq_res_0x100_num;       /* squares in sq_res_0x100 */
    1.1  mrg double sq_res_0x100_fraction;  /* sq_res_0x100_num / 256 */
    1.1  mrg
    1.1  mrg int     mod34_bits;        /* 3*GMP_NUMB_BITS/4 */
    1.1  mrg int     mod_bits;          /* bits from PERFSQR_MOD_34 or MOD_PP */
    1.1  mrg int     max_divisor;       /* all divisors <= max_divisor */
    1.1  mrg int     max_divisor_bits;  /* ceil(log2(max_divisor)) */
    1.1  mrg double  total_fraction;    /* of squares */
    1.1  mrg mpz_t   pp;                /* product of primes, or 0 if mod_34lsub1 used */
    1.1  mrg mpz_t   pp_norm;           /* pp shifted so NUMB high bit set */
    1.1  mrg mpz_t   pp_inverted;       /* invert_limb style inverse */
    1.1  mrg mpz_t   mod_mask;          /* 2^mod_bits-1 */
    1.1  mrg char    mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
    1.1  mrg
    1.1  mrg /* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
    1.1  mrg struct rawfactor_t {
    1.1  mrg   int     divisor;
    1.1  mrg   int     multiplicity;
    1.1  mrg };
    1.1  mrg struct rawfactor_t  *rawfactor;
    1.1  mrg int                 nrawfactor;
    1.1  mrg
    1.1  mrg /* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
    1.1  mrg struct factor_t {
    1.1  mrg   int     divisor;
    1.1  mrg   mpz_t   inverse;   /* 1/divisor mod 2^mod_bits */
    1.1  mrg   mpz_t   mask;      /* indicating squares mod divisor */
    1.1  mrg   double  fraction;  /* squares/total */
    1.1  mrg };
    1.1  mrg struct factor_t  *factor;
    1.1  mrg int              nfactor;       /* entries in use in factor array */
    1.1  mrg int              factor_alloc;  /* entries allocated to factor array */
    1.1  mrg
    1.1  mrg
    1.1  mrg int
    1.1  mrg f_cmp_divisor (const void *parg, const void *qarg)
    1.1  mrg {
    1.1  mrg   const struct factor_t *p, *q;
1.1.1.3  mrg   p = (const struct factor_t *) parg;
1.1.1.3  mrg   q = (const struct factor_t *) qarg;
    1.1  mrg   if (p->divisor > q->divisor)
    1.1  mrg     return 1;
    1.1  mrg   else if (p->divisor < q->divisor)
    1.1  mrg     return -1;
    1.1  mrg   else
    1.1  mrg     return 0;
    1.1  mrg }
    1.1  mrg
    1.1  mrg int
    1.1  mrg f_cmp_fraction (const void *parg, const void *qarg)
    1.1  mrg {
    1.1  mrg   const struct factor_t *p, *q;
1.1.1.3  mrg   p = (const struct factor_t *) parg;
1.1.1.3  mrg   q = (const struct factor_t *) qarg;
    1.1  mrg   if (p->fraction > q->fraction)
    1.1  mrg     return 1;
    1.1  mrg   else if (p->fraction < q->fraction)
    1.1  mrg     return -1;
    1.1  mrg   else
    1.1  mrg     return 0;
    1.1  mrg }
    1.1  mrg
    1.1  mrg /* Remove array[idx] by copying the remainder down, and adjust narray
    1.1  mrg    accordingly.  */
    1.1  mrg #define COLLAPSE_ELEMENT(array, idx, narray)                    \
    1.1  mrg   do {                                                          \
1.1.1.2  mrg     memmove (&(array)[idx],					\
1.1.1.2  mrg 	     &(array)[idx+1],					\
1.1.1.2  mrg 	     ((narray)-((idx)+1)) * sizeof (array[0]));		\
    1.1  mrg     (narray)--;                                                 \
    1.1  mrg   } while (0)
    1.1  mrg
    1.1  mrg
    1.1  mrg /* return n*2^p mod m */
    1.1  mrg int
    1.1  mrg mul_2exp_mod (int n, int p, int m)
    1.1  mrg {
1.1.1.3  mrg   while (--p >= 0)
    1.1  mrg     n = (2 * n) % m;
    1.1  mrg   return n;
    1.1  mrg }
    1.1  mrg
    1.1  mrg /* return -n mod m */
    1.1  mrg int
    1.1  mrg neg_mod (int n, int m)
    1.1  mrg {
1.1.1.2  mrg   assert (n >= 0 && n < m);
    1.1  mrg   return (n == 0 ? 0 : m-n);
    1.1  mrg }
    1.1  mrg
    1.1  mrg /* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
    1.1  mrg    "-(idx<<mod_bits)" can be a square modulo m.  */
    1.1  mrg void
    1.1  mrg square_mask (mpz_t mask, int m)
    1.1  mrg {
    1.1  mrg   int    p, i, r, idx;
    1.1  mrg
    1.1  mrg   p = mul_2exp_mod (1, mod_bits, m);
    1.1  mrg   p = neg_mod (p, m);
    1.1  mrg
    1.1  mrg   mpz_set_ui (mask, 0L);
    1.1  mrg   for (i = 0; i < m; i++)
    1.1  mrg     {
    1.1  mrg       r = (i * i) % m;
    1.1  mrg       idx = (r * p) % m;
    1.1  mrg       mpz_setbit (mask, (unsigned long) idx);
    1.1  mrg     }
    1.1  mrg }
    1.1  mrg
    1.1  mrg void
    1.1  mrg generate_sq_res_0x100 (int limb_bits)
    1.1  mrg {
    1.1  mrg   int  i, res;
    1.1  mrg
    1.1  mrg   nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
1.1.1.3  mrg   sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
    1.1  mrg
    1.1  mrg   for (i = 0; i < nsq_res_0x100; i++)
    1.1  mrg     mpz_init_set_ui (sq_res_0x100[i], 0L);
    1.1  mrg
    1.1  mrg   for (i = 0; i < 0x100; i++)
    1.1  mrg     {
    1.1  mrg       res = (i * i) % 0x100;
    1.1  mrg       mpz_setbit (sq_res_0x100[res / limb_bits],
    1.1  mrg                   (unsigned long) (res % limb_bits));
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   sq_res_0x100_num = 0;
    1.1  mrg   for (i = 0; i < nsq_res_0x100; i++)
    1.1  mrg     sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
    1.1  mrg   sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
    1.1  mrg }
    1.1  mrg
    1.1  mrg void
    1.1  mrg generate_mod (int limb_bits, int nail_bits)
    1.1  mrg {
    1.1  mrg   int    numb_bits = limb_bits - nail_bits;
    1.1  mrg   int    i, divisor;
    1.1  mrg
    1.1  mrg   mpz_init_set_ui (pp, 0L);
    1.1  mrg   mpz_init_set_ui (pp_norm, 0L);
    1.1  mrg   mpz_init_set_ui (pp_inverted, 0L);
    1.1  mrg
    1.1  mrg   /* no more than limb_bits many factors in a one limb modulus (and of
    1.1  mrg      course in reality nothing like that many) */
    1.1  mrg   factor_alloc = limb_bits;
1.1.1.3  mrg   factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
1.1.1.3  mrg   rawfactor = (struct rawfactor_t *) xmalloc (factor_alloc * sizeof (*rawfactor));
    1.1  mrg
    1.1  mrg   if (numb_bits % 4 != 0)
    1.1  mrg     {
    1.1  mrg       strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
    1.1  mrg       goto use_pp;
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   max_divisor = 2*limb_bits;
    1.1  mrg   max_divisor_bits = log2_ceil (max_divisor);
    1.1  mrg
    1.1  mrg   if (numb_bits / 4 < max_divisor_bits)
    1.1  mrg     {
    1.1  mrg       /* Wind back to one limb worth of max_divisor, if that will let us use
    1.1  mrg          mpn_mod_34lsub1.  */
    1.1  mrg       max_divisor = limb_bits;
    1.1  mrg       max_divisor_bits = log2_ceil (max_divisor);
    1.1  mrg
    1.1  mrg       if (numb_bits / 4 < max_divisor_bits)
    1.1  mrg         {
    1.1  mrg           strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
    1.1  mrg           goto use_pp;
    1.1  mrg         }
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   {
    1.1  mrg     /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
    1.1  mrg     mpz_t  m, q, r;
    1.1  mrg     int    multiplicity;
    1.1  mrg
    1.1  mrg     mod34_bits = (numb_bits / 4) * 3;
    1.1  mrg
    1.1  mrg     /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
    1.1  mrg        the mod34_bits mark, adding the two halves for a remainder of at most
    1.1  mrg        mod34_bits+1 many bits */
    1.1  mrg     mod_bits = mod34_bits + 1;
    1.1  mrg
    1.1  mrg     mpz_init_set_ui (m, 1L);
    1.1  mrg     mpz_mul_2exp (m, m, mod34_bits);
    1.1  mrg     mpz_sub_ui (m, m, 1L);
    1.1  mrg
    1.1  mrg     mpz_init (q);
    1.1  mrg     mpz_init (r);
    1.1  mrg
1.1.1.3  mrg     for (i = 3; i <= max_divisor; i+=2)
    1.1  mrg       {
    1.1  mrg         if (! isprime (i))
    1.1  mrg           continue;
    1.1  mrg
    1.1  mrg         mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
    1.1  mrg         if (mpz_sgn (r) != 0)
    1.1  mrg           continue;
    1.1  mrg
    1.1  mrg         /* if a repeated prime is found it's used as an i^n in one factor */
    1.1  mrg         divisor = 1;
    1.1  mrg         multiplicity = 0;
    1.1  mrg         do
    1.1  mrg           {
    1.1  mrg             if (divisor > max_divisor / i)
    1.1  mrg               break;
    1.1  mrg             multiplicity++;
    1.1  mrg             mpz_set (m, q);
    1.1  mrg             mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
    1.1  mrg           }
    1.1  mrg         while (mpz_sgn (r) == 0);
    1.1  mrg
1.1.1.2  mrg         assert (nrawfactor < factor_alloc);
    1.1  mrg         rawfactor[nrawfactor].divisor = i;
    1.1  mrg         rawfactor[nrawfactor].multiplicity = multiplicity;
    1.1  mrg         nrawfactor++;
    1.1  mrg       }
    1.1  mrg
    1.1  mrg     mpz_clear (m);
    1.1  mrg     mpz_clear (q);
    1.1  mrg     mpz_clear (r);
    1.1  mrg   }
    1.1  mrg
    1.1  mrg   if (nrawfactor <= 2)
    1.1  mrg     {
    1.1  mrg       mpz_t  new_pp;
    1.1  mrg
    1.1  mrg       sprintf (mod34_excuse, "only %d small factor%s",
    1.1  mrg                nrawfactor, nrawfactor == 1 ? "" : "s");
    1.1  mrg
    1.1  mrg     use_pp:
    1.1  mrg       /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
    1.1  mrg          tried with just one */
    1.1  mrg       max_divisor = 2*limb_bits;
    1.1  mrg       max_divisor_bits = log2_ceil (max_divisor);
    1.1  mrg
    1.1  mrg       mpz_init (new_pp);
    1.1  mrg       nrawfactor = 0;
    1.1  mrg       mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
    1.1  mrg
    1.1  mrg       /* one copy of each small prime */
    1.1  mrg       mpz_set_ui (pp, 1L);
1.1.1.3  mrg       for (i = 3; i <= max_divisor; i+=2)
    1.1  mrg         {
    1.1  mrg           if (! isprime (i))
    1.1  mrg             continue;
    1.1  mrg
    1.1  mrg           mpz_mul_ui (new_pp, pp, (unsigned long) i);
    1.1  mrg           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
    1.1  mrg             break;
    1.1  mrg           mpz_set (pp, new_pp);
    1.1  mrg
1.1.1.2  mrg           assert (nrawfactor < factor_alloc);
    1.1  mrg           rawfactor[nrawfactor].divisor = i;
    1.1  mrg           rawfactor[nrawfactor].multiplicity = 1;
    1.1  mrg           nrawfactor++;
    1.1  mrg         }
    1.1  mrg
    1.1  mrg       /* Plus an extra copy of one or more of the primes selected, if that
    1.1  mrg          still fits in max_divisor and the total in mod_bits.  Usually only
    1.1  mrg          3 or 5 will be candidates */
    1.1  mrg       for (i = nrawfactor-1; i >= 0; i--)
    1.1  mrg         {
    1.1  mrg           if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
    1.1  mrg             continue;
    1.1  mrg           mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
    1.1  mrg           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
    1.1  mrg             continue;
    1.1  mrg           mpz_set (pp, new_pp);
    1.1  mrg
    1.1  mrg           rawfactor[i].multiplicity++;
    1.1  mrg         }
    1.1  mrg
    1.1  mrg       mod_bits = mpz_sizeinbase (pp, 2);
    1.1  mrg
    1.1  mrg       mpz_set (pp_norm, pp);
    1.1  mrg       while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
    1.1  mrg         mpz_add (pp_norm, pp_norm, pp_norm);
    1.1  mrg
    1.1  mrg       mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
    1.1  mrg
    1.1  mrg       mpz_clear (new_pp);
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   /* start the factor array */
    1.1  mrg   for (i = 0; i < nrawfactor; i++)
    1.1  mrg     {
    1.1  mrg       int  j;
1.1.1.2  mrg       assert (nfactor < factor_alloc);
    1.1  mrg       factor[nfactor].divisor = 1;
    1.1  mrg       for (j = 0; j < rawfactor[i].multiplicity; j++)
    1.1  mrg         factor[nfactor].divisor *= rawfactor[i].divisor;
    1.1  mrg       nfactor++;
    1.1  mrg     }
    1.1  mrg
    1.1  mrg  combine:
    1.1  mrg   /* Combine entries in the factor array.  Combine the smallest entry with
    1.1  mrg      the biggest one that will fit with it (ie. under max_divisor), then
    1.1  mrg      repeat that with the new smallest entry. */
    1.1  mrg   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
    1.1  mrg   for (i = nfactor-1; i >= 1; i--)
    1.1  mrg     {
    1.1  mrg       if (factor[i].divisor <= max_divisor / factor[0].divisor)
    1.1  mrg         {
    1.1  mrg           factor[0].divisor *= factor[i].divisor;
    1.1  mrg           COLLAPSE_ELEMENT (factor, i, nfactor);
    1.1  mrg           goto combine;
    1.1  mrg         }
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   total_fraction = 1.0;
    1.1  mrg   for (i = 0; i < nfactor; i++)
    1.1  mrg     {
    1.1  mrg       mpz_init (factor[i].inverse);
    1.1  mrg       mpz_invert_ui_2exp (factor[i].inverse,
    1.1  mrg                           (unsigned long) factor[i].divisor,
    1.1  mrg                           (unsigned long) mod_bits);
    1.1  mrg
    1.1  mrg       mpz_init (factor[i].mask);
    1.1  mrg       square_mask (factor[i].mask, factor[i].divisor);
    1.1  mrg
    1.1  mrg       /* fraction of possible squares */
    1.1  mrg       factor[i].fraction = (double) mpz_popcount (factor[i].mask)
    1.1  mrg         / factor[i].divisor;
    1.1  mrg
    1.1  mrg       /* total fraction of possible squares */
    1.1  mrg       total_fraction *= factor[i].fraction;
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   /* best tests first (ie. smallest fraction) */
    1.1  mrg   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
    1.1  mrg }
    1.1  mrg
    1.1  mrg void
    1.1  mrg print (int limb_bits, int nail_bits)
    1.1  mrg {
    1.1  mrg   int    i;
    1.1  mrg   mpz_t  mhi, mlo;
    1.1  mrg
    1.1  mrg   printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
    1.1  mrg   printf ("\n");
    1.1  mrg
    1.1  mrg   printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
    1.1  mrg           limb_bits, nail_bits);
    1.1  mrg   printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
    1.1  mrg           limb_bits, nail_bits);
    1.1  mrg   printf ("#endif\n");
    1.1  mrg   printf ("\n");
    1.1  mrg
    1.1  mrg   printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
    1.1  mrg   printf ("   This test identifies %.2f%% as non-squares (%d/256). */\n",
    1.1  mrg           (1.0 - sq_res_0x100_fraction) * 100.0,
    1.1  mrg           0x100 - sq_res_0x100_num);
    1.1  mrg   printf ("static const mp_limb_t\n");
    1.1  mrg   printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
    1.1  mrg   for (i = 0; i < nsq_res_0x100; i++)
    1.1  mrg     {
    1.1  mrg       printf ("  CNST_LIMB(0x");
    1.1  mrg       mpz_out_str (stdout, 16, sq_res_0x100[i]);
    1.1  mrg       printf ("),\n");
    1.1  mrg     }
    1.1  mrg   printf ("};\n");
    1.1  mrg   printf ("\n");
    1.1  mrg
    1.1  mrg   if (mpz_sgn (pp) != 0)
    1.1  mrg     {
    1.1  mrg       printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
    1.1  mrg       printf ("/* PERFSQR_PP = ");
    1.1  mrg     }
    1.1  mrg   else
    1.1  mrg     printf ("/* 2^%d-1 = ", mod34_bits);
    1.1  mrg   for (i = 0; i < nrawfactor; i++)
    1.1  mrg     {
    1.1  mrg       if (i != 0)
    1.1  mrg         printf (" * ");
    1.1  mrg       printf ("%d", rawfactor[i].divisor);
    1.1  mrg       if (rawfactor[i].multiplicity != 1)
    1.1  mrg         printf ("^%d", rawfactor[i].multiplicity);
    1.1  mrg     }
    1.1  mrg   printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
    1.1  mrg
    1.1  mrg   printf ("#define PERFSQR_MOD_BITS  %d\n", mod_bits);
    1.1  mrg   if (mpz_sgn (pp) != 0)
    1.1  mrg     {
    1.1  mrg       printf ("#define PERFSQR_PP            CNST_LIMB(0x");
    1.1  mrg       mpz_out_str (stdout, 16, pp);
    1.1  mrg       printf (")\n");
    1.1  mrg       printf ("#define PERFSQR_PP_NORM       CNST_LIMB(0x");
    1.1  mrg       mpz_out_str (stdout, 16, pp_norm);
    1.1  mrg       printf (")\n");
    1.1  mrg       printf ("#define PERFSQR_PP_INVERTED   CNST_LIMB(0x");
    1.1  mrg       mpz_out_str (stdout, 16, pp_inverted);
    1.1  mrg       printf (")\n");
    1.1  mrg     }
    1.1  mrg   printf ("\n");
    1.1  mrg
    1.1  mrg   mpz_init (mhi);
    1.1  mrg   mpz_init (mlo);
    1.1  mrg
    1.1  mrg   printf ("/* This test identifies %.2f%% as non-squares. */\n",
    1.1  mrg           (1.0 - total_fraction) * 100.0);
    1.1  mrg   printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
    1.1  mrg   printf ("  do {                              \\\n");
    1.1  mrg   printf ("    mp_limb_t  r;                   \\\n");
    1.1  mrg   if (mpz_sgn (pp) != 0)
    1.1  mrg     printf ("    PERFSQR_MOD_PP (r, up, usize);  \\\n");
    1.1  mrg   else
    1.1  mrg     printf ("    PERFSQR_MOD_34 (r, up, usize);  \\\n");
    1.1  mrg
    1.1  mrg   for (i = 0; i < nfactor; i++)
    1.1  mrg     {
    1.1  mrg       printf ("                                    \\\n");
    1.1  mrg       printf ("    /* %5.2f%% */                    \\\n",
    1.1  mrg               (1.0 - factor[i].fraction) * 100.0);
    1.1  mrg
    1.1  mrg       printf ("    PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
    1.1  mrg               factor[i].divisor <= limb_bits ? 1 : 2,
    1.1  mrg               factor[i].divisor);
    1.1  mrg       mpz_out_str (stdout, 16, factor[i].inverse);
    1.1  mrg       printf ("), \\\n");
    1.1  mrg       printf ("                   CNST_LIMB(0x");
    1.1  mrg
    1.1  mrg       if ( factor[i].divisor <= limb_bits)
    1.1  mrg         {
    1.1  mrg           mpz_out_str (stdout, 16, factor[i].mask);
    1.1  mrg         }
    1.1  mrg       else
    1.1  mrg         {
    1.1  mrg           mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
    1.1  mrg           mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
    1.1  mrg           mpz_out_str (stdout, 16, mhi);
    1.1  mrg           printf ("), CNST_LIMB(0x");
    1.1  mrg           mpz_out_str (stdout, 16, mlo);
    1.1  mrg         }
    1.1  mrg       printf (")); \\\n");
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   printf ("  } while (0)\n");
    1.1  mrg   printf ("\n");
    1.1  mrg
    1.1  mrg   printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
    1.1  mrg           (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
    1.1  mrg   printf ("\n");
    1.1  mrg
    1.1  mrg   printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
    1.1  mrg   printf ("#define PERFSQR_DIVISORS  { 256,");
    1.1  mrg   for (i = 0; i < nfactor; i++)
    1.1  mrg       printf (" %d,", factor[i].divisor);
    1.1  mrg   printf (" }\n");
    1.1  mrg
    1.1  mrg
    1.1  mrg   mpz_clear (mhi);
    1.1  mrg   mpz_clear (mlo);
    1.1  mrg }
    1.1  mrg
    1.1  mrg int
    1.1  mrg main (int argc, char *argv[])
    1.1  mrg {
    1.1  mrg   int  limb_bits, nail_bits;
    1.1  mrg
    1.1  mrg   if (argc != 3)
    1.1  mrg     {
    1.1  mrg       fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
    1.1  mrg       exit (1);
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   limb_bits = atoi (argv[1]);
    1.1  mrg   nail_bits = atoi (argv[2]);
    1.1  mrg
    1.1  mrg   if (limb_bits <= 0
    1.1  mrg       || nail_bits < 0
    1.1  mrg       || nail_bits >= limb_bits)
    1.1  mrg     {
    1.1  mrg       fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
    1.1  mrg                limb_bits, nail_bits);
    1.1  mrg       exit (1);
    1.1  mrg     }
    1.1  mrg
    1.1  mrg   generate_sq_res_0x100 (limb_bits);
    1.1  mrg   generate_mod (limb_bits, nail_bits);
    1.1  mrg
    1.1  mrg   print (limb_bits, nail_bits);
    1.1  mrg
    1.1  mrg   return 0;
    1.1  mrg }