15dfecf96Smrg/*
25dfecf96Smrg * Copyright (c) 2002 by The XFree86 Project, Inc.
35dfecf96Smrg *
45dfecf96Smrg * Permission is hereby granted, free of charge, to any person obtaining a
55dfecf96Smrg * copy of this software and associated documentation files (the "Software"),
65dfecf96Smrg * to deal in the Software without restriction, including without limitation
75dfecf96Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
85dfecf96Smrg * and/or sell copies of the Software, and to permit persons to whom the
95dfecf96Smrg * Software is furnished to do so, subject to the following conditions:
105dfecf96Smrg *
115dfecf96Smrg * The above copyright notice and this permission notice shall be included in
125dfecf96Smrg * all copies or substantial portions of the Software.
135dfecf96Smrg *
145dfecf96Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
155dfecf96Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
165dfecf96Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
175dfecf96Smrg * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
185dfecf96Smrg * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
195dfecf96Smrg * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
205dfecf96Smrg * SOFTWARE.
215dfecf96Smrg *
225dfecf96Smrg * Except as contained in this notice, the name of the XFree86 Project shall
235dfecf96Smrg * not be used in advertising or otherwise to promote the sale, use or other
245dfecf96Smrg * dealings in this Software without prior written authorization from the
255dfecf96Smrg * XFree86 Project.
265dfecf96Smrg *
275dfecf96Smrg * Author: Paulo César Pereira de Andrade
285dfecf96Smrg */
295dfecf96Smrg
305dfecf96Smrg/* $XFree86: xc/programs/xedit/lisp/mp/mp.c,v 1.2 2002/11/08 08:01:00 paulo Exp $ */
315dfecf96Smrg
325dfecf96Smrg#include "mp.h"
335dfecf96Smrg
345dfecf96Smrg/*
355dfecf96Smrg * TODO:
365dfecf96Smrg *	o Optimize squaring
375dfecf96Smrg *	o Write better division code and move from mpi.c to here
385dfecf96Smrg *	o Make multiplication code don't required memory to be zeroed
395dfecf96Smrg *		+ The first step is easy, just multiply the low word,
405dfecf96Smrg *		then the high word, that may overlap with the result
415dfecf96Smrg *		of the first multiply (in case of carry), and then
425dfecf96Smrg *		just make sure carry is properly propagated in the
435dfecf96Smrg *		subsequent multiplications.
445dfecf96Smrg *		+ Some code needs also to be rewritten because some
455dfecf96Smrg *		intermediate addition code in mp_mul, mp_karatsuba_mul,
465dfecf96Smrg *		and mp_toom_mul is assuming the memory is zeroed.
475dfecf96Smrg */
485dfecf96Smrg
495dfecf96Smrg/*
505dfecf96Smrg * Prototypes
515dfecf96Smrg */
525dfecf96Smrg	/* out of memory handler */
535dfecf96Smrgstatic void mp_outmem(void);
545dfecf96Smrg
555dfecf96Smrg	/* memory allocation fallback functions */
565dfecf96Smrgstatic void *_mp_malloc(size_t);
575dfecf96Smrgstatic void *_mp_calloc(size_t, size_t);
585dfecf96Smrgstatic void *_mp_realloc(void*, size_t);
595dfecf96Smrgstatic void _mp_free(void*);
605dfecf96Smrg
615dfecf96Smrg/*
625dfecf96Smrg * Initialization
635dfecf96Smrg */
645dfecf96Smrgstatic mp_malloc_fun __mp_malloc = _mp_malloc;
655dfecf96Smrgstatic mp_calloc_fun __mp_calloc = _mp_calloc;
665dfecf96Smrgstatic mp_realloc_fun __mp_realloc = _mp_realloc;
675dfecf96Smrgstatic mp_free_fun __mp_free = _mp_free;
685dfecf96Smrg
695dfecf96Smrg/*
705dfecf96Smrg * Implementation
715dfecf96Smrg */
725dfecf96Smrgstatic void
735dfecf96Smrgmp_outmem(void)
745dfecf96Smrg{
755dfecf96Smrg    fprintf(stderr, "out of memory in MP library.\n");
765dfecf96Smrg    exit(1);
775dfecf96Smrg}
785dfecf96Smrg
795dfecf96Smrgstatic void *
805dfecf96Smrg_mp_malloc(size_t size)
815dfecf96Smrg{
825dfecf96Smrg    return (malloc(size));
835dfecf96Smrg}
845dfecf96Smrg
855dfecf96Smrgvoid *
865dfecf96Smrgmp_malloc(size_t size)
875dfecf96Smrg{
885dfecf96Smrg    void *pointer = (*__mp_malloc)(size);
895dfecf96Smrg
905dfecf96Smrg    if (pointer == NULL)
915dfecf96Smrg	mp_outmem();
925dfecf96Smrg
935dfecf96Smrg    return (pointer);
945dfecf96Smrg}
955dfecf96Smrg
965dfecf96Smrgmp_malloc_fun
975dfecf96Smrgmp_set_malloc(mp_malloc_fun fun)
985dfecf96Smrg{
995dfecf96Smrg    mp_malloc_fun old = __mp_malloc;
1005dfecf96Smrg
1015dfecf96Smrg    __mp_malloc = fun;
1025dfecf96Smrg
1035dfecf96Smrg    return (old);
1045dfecf96Smrg}
1055dfecf96Smrg
1065dfecf96Smrgstatic void *
1075dfecf96Smrg_mp_calloc(size_t nmemb, size_t size)
1085dfecf96Smrg{
1095dfecf96Smrg    return (calloc(nmemb, size));
1105dfecf96Smrg}
1115dfecf96Smrg
1125dfecf96Smrgvoid *
1135dfecf96Smrgmp_calloc(size_t nmemb, size_t size)
1145dfecf96Smrg{
1155dfecf96Smrg    void *pointer = (*__mp_calloc)(nmemb, size);
1165dfecf96Smrg
1175dfecf96Smrg    if (pointer == NULL)
1185dfecf96Smrg	mp_outmem();
1195dfecf96Smrg
1205dfecf96Smrg    return (pointer);
1215dfecf96Smrg}
1225dfecf96Smrg
1235dfecf96Smrgmp_calloc_fun
1245dfecf96Smrgmp_set_calloc(mp_calloc_fun fun)
1255dfecf96Smrg{
1265dfecf96Smrg    mp_calloc_fun old = __mp_calloc;
1275dfecf96Smrg
1285dfecf96Smrg    __mp_calloc = fun;
1295dfecf96Smrg
1305dfecf96Smrg    return (old);
1315dfecf96Smrg}
1325dfecf96Smrg
1335dfecf96Smrgstatic void *
1345dfecf96Smrg_mp_realloc(void *old, size_t size)
1355dfecf96Smrg{
1365dfecf96Smrg    return (realloc(old, size));
1375dfecf96Smrg}
1385dfecf96Smrg
1395dfecf96Smrgvoid *
1405dfecf96Smrgmp_realloc(void *old, size_t size)
1415dfecf96Smrg{
1425dfecf96Smrg    void *pointer = (*__mp_realloc)(old, size);
1435dfecf96Smrg
1445dfecf96Smrg    if (pointer == NULL)
1455dfecf96Smrg	mp_outmem();
1465dfecf96Smrg
1475dfecf96Smrg    return (pointer);
1485dfecf96Smrg}
1495dfecf96Smrg
1505dfecf96Smrgmp_realloc_fun
1515dfecf96Smrgmp_set_realloc(mp_realloc_fun fun)
1525dfecf96Smrg{
1535dfecf96Smrg    mp_realloc_fun old = __mp_realloc;
1545dfecf96Smrg
1555dfecf96Smrg    __mp_realloc = fun;
1565dfecf96Smrg
1575dfecf96Smrg    return (old);
1585dfecf96Smrg}
1595dfecf96Smrg
1605dfecf96Smrgstatic void
1615dfecf96Smrg_mp_free(void *pointer)
1625dfecf96Smrg{
1635dfecf96Smrg    free(pointer);
1645dfecf96Smrg}
1655dfecf96Smrg
1665dfecf96Smrgvoid
1675dfecf96Smrgmp_free(void *pointer)
1685dfecf96Smrg{
1695dfecf96Smrg    (*__mp_free)(pointer);
1705dfecf96Smrg}
1715dfecf96Smrg
1725dfecf96Smrgmp_free_fun
1735dfecf96Smrgmp_set_free(mp_free_fun fun)
1745dfecf96Smrg{
1755dfecf96Smrg    mp_free_fun old = __mp_free;
1765dfecf96Smrg
1775dfecf96Smrg    __mp_free = fun;
1785dfecf96Smrg
1795dfecf96Smrg    return (old);
1805dfecf96Smrg}
1815dfecf96Smrg
1825dfecf96Smrglong
1835dfecf96Smrgmp_add(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
1845dfecf96Smrg{
1855dfecf96Smrg    BNI value;			/* intermediate result */
1865dfecf96Smrg    BNS carry;			/* carry flag */
1875dfecf96Smrg    long size;			/* result size */
1885dfecf96Smrg
1895dfecf96Smrg    if (len1 < len2)
1905dfecf96Smrg	MP_SWAP(op1, op2, len1, len2);
1915dfecf96Smrg
1925dfecf96Smrg    /* unroll start of loop */
19331de2854Smrg    value = (BNI)op1[0] + op2[0];
1945dfecf96Smrg    rop[0] = value;
1955dfecf96Smrg    carry = value >> BNSBITS;
1965dfecf96Smrg
1975dfecf96Smrg    /* add op1 and op2 */
1985dfecf96Smrg    for (size = 1; size < len2; size++) {
19931de2854Smrg	value = (BNI)op1[size] + op2[size] + carry;
2005dfecf96Smrg	rop[size] = value;
2015dfecf96Smrg	carry = value >> BNSBITS;
2025dfecf96Smrg    }
2035dfecf96Smrg    if (rop != op1) {
2045dfecf96Smrg	for (; size < len1; size++) {
20531de2854Smrg	    value = (BNI)op1[size] + carry;
2065dfecf96Smrg	    rop[size] = value;
2075dfecf96Smrg	    carry = value >> BNSBITS;
2085dfecf96Smrg	}
2095dfecf96Smrg    }
2105dfecf96Smrg    else {
2115dfecf96Smrg	/* if rop == op1, than just adjust carry */
2125dfecf96Smrg	for (; carry && size < len1; size++) {
21331de2854Smrg	    value = (BNI)op1[size] + carry;
2145dfecf96Smrg	    rop[size] = value;
2155dfecf96Smrg	    carry = value >> BNSBITS;
2165dfecf96Smrg	}
2175dfecf96Smrg	size = len1;
2185dfecf96Smrg    }
2195dfecf96Smrg    if (carry)
2205dfecf96Smrg	rop[size++] = carry;
2215dfecf96Smrg
2225dfecf96Smrg    return (size);
2235dfecf96Smrg}
2245dfecf96Smrg
2255dfecf96Smrglong
2265dfecf96Smrgmp_sub(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
2275dfecf96Smrg{
2285dfecf96Smrg    long svalue;		/* intermediate result */
2295dfecf96Smrg    BNS carry;			/* carry flag */
2305dfecf96Smrg    long size;			/* result size */
2315dfecf96Smrg
2325dfecf96Smrg    /* special case */
2335dfecf96Smrg    if (op1 == op2) {
2345dfecf96Smrg	rop[0] = 0;
2355dfecf96Smrg
2365dfecf96Smrg	return (1);
2375dfecf96Smrg    }
2385dfecf96Smrg
2395dfecf96Smrg    /* unroll start of loop */
24031de2854Smrg    svalue = (long)op1[0] - op2[0];
2415dfecf96Smrg    rop[0] = svalue;
2425dfecf96Smrg    carry = svalue < 0;
2435dfecf96Smrg
2445dfecf96Smrg    /* subtracts op2 from op1 */
2455dfecf96Smrg    for (size = 1; size < len2; size++) {
2465dfecf96Smrg	svalue = (long)(op1[size]) - op2[size] - carry;
2475dfecf96Smrg	rop[size] = svalue;
2485dfecf96Smrg	carry = svalue < 0;
2495dfecf96Smrg    }
2505dfecf96Smrg    if (rop != op1) {
2515dfecf96Smrg	for (; size < len1; size++) {
2525dfecf96Smrg	    svalue = op1[size] - carry;
2535dfecf96Smrg	    rop[size] = svalue;
2545dfecf96Smrg	    carry = svalue < 0;
2555dfecf96Smrg	}
2565dfecf96Smrg    }
2575dfecf96Smrg    else {
2585dfecf96Smrg	/* if rop == op1, than just adjust carry */
2595dfecf96Smrg	for (; carry && size < len1; size++) {
26031de2854Smrg	    svalue = (long)op1[size] - carry;
2615dfecf96Smrg	    rop[size] = svalue;
2625dfecf96Smrg	    carry = svalue < 0;
2635dfecf96Smrg	}
2645dfecf96Smrg	size = len1;
2655dfecf96Smrg    }
2665dfecf96Smrg
2675dfecf96Smrg    /* calculate result size */
2685dfecf96Smrg    while (size > 1 && rop[size - 1] == 0)
2695dfecf96Smrg	--size;
2705dfecf96Smrg
2715dfecf96Smrg    return (size);
2725dfecf96Smrg}
2735dfecf96Smrg
2745dfecf96Smrglong
2755dfecf96Smrgmp_lshift(BNS *rop, BNS *op, BNI len, long shift)
2765dfecf96Smrg{
2775dfecf96Smrg    long i, size;
2785dfecf96Smrg    BNI words, bits;		/* how many word and bit shifts */
2795dfecf96Smrg
2805dfecf96Smrg    words = shift / BNSBITS;
2815dfecf96Smrg    bits = shift % BNSBITS;
2825dfecf96Smrg    size = len + words;
2835dfecf96Smrg
2845dfecf96Smrg    if (bits) {
2855dfecf96Smrg	BNS hi, lo;
2865dfecf96Smrg	BNI carry;
2875dfecf96Smrg	int adj;
2885dfecf96Smrg
2895dfecf96Smrg	for (i = 1, carry = CARRY >> 1; carry; i++, carry >>= 1)
2905dfecf96Smrg	    if (op[len - 1] & carry)
2915dfecf96Smrg		break;
2925dfecf96Smrg	adj = (bits + (BNSBITS - i)) / BNSBITS;
2935dfecf96Smrg	size += adj;
2945dfecf96Smrg
2955dfecf96Smrg	lo = hi = op[0];
2965dfecf96Smrg	rop[words] = lo << bits;
2975dfecf96Smrg	for (i = 1; i < len; i++) {
2985dfecf96Smrg	    hi = op[i];
2995dfecf96Smrg	    rop[words + i] = hi << bits | (lo >> (BNSBITS - bits));
3005dfecf96Smrg	    lo = hi;
3015dfecf96Smrg	}
3025dfecf96Smrg	if (adj)
3035dfecf96Smrg	    rop[size - 1] = hi >> (BNSBITS - bits);
3045dfecf96Smrg    }
3055dfecf96Smrg    else
3065dfecf96Smrg	memmove(rop + size - len, op, sizeof(BNS) * len);
3075dfecf96Smrg
3085dfecf96Smrg    if (words)
3095dfecf96Smrg	memset(rop, '\0', sizeof(BNS) * words);
3105dfecf96Smrg
3115dfecf96Smrg    return (size);
3125dfecf96Smrg}
3135dfecf96Smrg
3145dfecf96Smrglong
3155dfecf96Smrgmp_rshift(BNS *rop, BNS *op, BNI len, long shift)
3165dfecf96Smrg{
3175dfecf96Smrg    int adj = 0;
3185dfecf96Smrg    long i, size;
3195dfecf96Smrg    BNI words, bits;		/* how many word and bit shifts */
3205dfecf96Smrg
3215dfecf96Smrg    words = shift / BNSBITS;
3225dfecf96Smrg    bits = shift % BNSBITS;
3235dfecf96Smrg    size = len - words;
3245dfecf96Smrg
3255dfecf96Smrg    if (bits) {
3265dfecf96Smrg	BNS hi, lo;
3275dfecf96Smrg	BNI carry;
3285dfecf96Smrg
3295dfecf96Smrg	for (i = 0, carry = CARRY >> 1; carry; i++, carry >>= 1)
3305dfecf96Smrg	    if (op[len - 1] & carry)
3315dfecf96Smrg		break;
3325dfecf96Smrg	adj = (bits + i) / BNSBITS;
3335dfecf96Smrg	if (size - adj == 0) {
3345dfecf96Smrg	    rop[0] = 0;
3355dfecf96Smrg
3365dfecf96Smrg	    return (1);
3375dfecf96Smrg	}
3385dfecf96Smrg
3395dfecf96Smrg	hi = lo = op[words + size - 1];
3405dfecf96Smrg	rop[size - 1] = hi >> bits;
3415dfecf96Smrg	for (i = size - 2; i >= 0; i--) {
3425dfecf96Smrg	    lo = op[words + i];
3435dfecf96Smrg	    rop[i] = (lo >> bits) | (hi << (BNSBITS - bits));
3445dfecf96Smrg	    hi = lo;
3455dfecf96Smrg	}
3465dfecf96Smrg	if (adj)
3475dfecf96Smrg	    rop[0] |= lo << (BNSBITS - bits);
3485dfecf96Smrg    }
3495dfecf96Smrg    else
3505dfecf96Smrg	memmove(rop, op + len - size, size * sizeof(BNS));
3515dfecf96Smrg
3525dfecf96Smrg    return (size - adj);
3535dfecf96Smrg}
3545dfecf96Smrg
3555dfecf96Smrg	/* rop must be a pointer to len1 + len2 elements
3565dfecf96Smrg	 * rop cannot be either op1 or op2
3575dfecf96Smrg	 * rop must be all zeros */
3585dfecf96Smrglong
3595dfecf96Smrgmp_base_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
3605dfecf96Smrg{
3615dfecf96Smrg    long i, j;			/* counters */
3625dfecf96Smrg    BNI value;			/* intermediate result */
3635dfecf96Smrg    BNS carry;			/* carry value */
3645dfecf96Smrg    long size = len1 + len2;
3655dfecf96Smrg
3665dfecf96Smrg    /* simple optimization: first pass does not need to deference rop[i+j] */
3675dfecf96Smrg    if (op1[0]) {
3685dfecf96Smrg	value = (BNI)(op1[0]) * op2[0];
3695dfecf96Smrg	rop[0] = value;
3705dfecf96Smrg	carry = (BNS)(value >> BNSBITS);
3715dfecf96Smrg	for (j = 1; j < len2; j++) {
3725dfecf96Smrg	    value = (BNI)(op1[0]) * op2[j] + carry;
3735dfecf96Smrg	    rop[j] = value;
3745dfecf96Smrg	    carry = (BNS)(value >> BNSBITS);
3755dfecf96Smrg	}
3765dfecf96Smrg	rop[j] = carry;
3775dfecf96Smrg    }
3785dfecf96Smrg
3795dfecf96Smrg    /* do the multiplication */
3805dfecf96Smrg    for (i = 1; i < len1; i++) {
3815dfecf96Smrg	if (op1[i]) {
3825dfecf96Smrg	    /* unrool loop initialization */
3835dfecf96Smrg	    value = (BNI)(op1[i]) * op2[0] + rop[i];
3845dfecf96Smrg	    rop[i] = value;
3855dfecf96Smrg	    carry = (BNS)(value >> BNSBITS);
3865dfecf96Smrg	    /* multiply */
3875dfecf96Smrg	    for (j = 1; j < len2; j++) {
3885dfecf96Smrg		value = (BNI)(op1[i]) * op2[j] + rop[i + j] + carry;
3895dfecf96Smrg		rop[i + j] = value;
3905dfecf96Smrg		carry = (BNS)(value >> BNSBITS);
3915dfecf96Smrg	    }
3925dfecf96Smrg	    rop[i + j] = carry;
3935dfecf96Smrg	}
3945dfecf96Smrg    }
3955dfecf96Smrg
3965dfecf96Smrg    if (size > 1 && rop[size - 1] == 0)
3975dfecf96Smrg	--size;
3985dfecf96Smrg
3995dfecf96Smrg    return (size);
4005dfecf96Smrg}
4015dfecf96Smrg
4025dfecf96Smrg	/* Karatsuba method
4035dfecf96Smrg	 * t + ((a0 + a1) (b0 + b1) - t - u) x + ux²
4045dfecf96Smrg	 * where t = a0b0 and u = a1b1
4055dfecf96Smrg	 *
4065dfecf96Smrg	 * Karatsuba method reduces the number of multiplications. Example:
4075dfecf96Smrg	 *	Square a 40 length number
4085dfecf96Smrg	 *	instead of a plain 40*40 = 1600 multiplies/adds, it does:
4095dfecf96Smrg	 *	20*20+20*20+20*20 = 1200
4105dfecf96Smrg	 *	but since it is recursive, every 20*20=400 is reduced to
4115dfecf96Smrg	 *	10*10+10*10+10*10=300
4125dfecf96Smrg	 *	and so on.
4135dfecf96Smrg	 * The multiplication by x and x² is a just a shift, as it is a
4145dfecf96Smrg	 * power of two, and is implemented below by just writting at the
4155dfecf96Smrg	 * correct offset */
4165dfecf96Smrglong
4175dfecf96Smrgmp_karatsuba_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
4185dfecf96Smrg{
4195dfecf96Smrg    BNI x;				/* shift count */
4205dfecf96Smrg    BNI la0, la1, lb0, lb1;		/* length of a0, a1, b0, and b1 */
4215dfecf96Smrg    BNS *t;				/* temporary memory for t product */
4225dfecf96Smrg    BNS *u;				/* temporary memory for u product */
4235dfecf96Smrg    BNS *r;				/* pointer to rop */
4245dfecf96Smrg    long xlen, tlen, ulen;
4255dfecf96Smrg
4265dfecf96Smrg    /* calculate value of x, that is 2^(BNSBITS*x) */
4275dfecf96Smrg    if (len1 >= len2)
4285dfecf96Smrg	x = (len1 + 1) >> 1;
4295dfecf96Smrg    else
4305dfecf96Smrg	x = (len2 + 1) >> 1;
4315dfecf96Smrg
4325dfecf96Smrg    /* calculate length of operands */
4335dfecf96Smrg    la0 = x;
4345dfecf96Smrg    la1 = len1 - x;
4355dfecf96Smrg    lb0 = x;
4365dfecf96Smrg    lb1 = len2 - x;
4375dfecf96Smrg
4385dfecf96Smrg    /* allocate buffer for t and (a0 + a1) */
4395dfecf96Smrg    tlen = la0 + lb0;
4405dfecf96Smrg    t = mp_malloc(sizeof(BNS) * tlen);
4415dfecf96Smrg
4425dfecf96Smrg    /* allocate buffer for u and (b0 + b1) */
4435dfecf96Smrg    if (la1 + lb1 < lb0 + lb1 + 1)
4445dfecf96Smrg	ulen = lb0 + lb1 + 1;
4455dfecf96Smrg    else
4465dfecf96Smrg	ulen = la1 + lb1;
4475dfecf96Smrg    u = mp_malloc(sizeof(BNS) * ulen);
4485dfecf96Smrg
4495dfecf96Smrg    /* calculate a0 + a1, store result in t */
4505dfecf96Smrg    tlen = mp_add(t, op1, op1 + x, la0, la1);
4515dfecf96Smrg
4525dfecf96Smrg    /* calculate b0 + b1, store result in u */
4535dfecf96Smrg    ulen = mp_add(u, op2, op2 + x, lb0, lb1);
4545dfecf96Smrg
4555dfecf96Smrg    /* store (a0 + a1) * (b0 + b1) in rop */
4565dfecf96Smrg
4575dfecf96Smrg    r = rop + x;		/* multiplied by 2^(BNSBITS*x) */
4585dfecf96Smrg    xlen = mp_mul(r, t, u, tlen, ulen);
4595dfecf96Smrg
4605dfecf96Smrg    /* must zero t and u memory, this is required for mp_mul */
4615dfecf96Smrg
4625dfecf96Smrg    /* calculate t = a0 * b0 */
4635dfecf96Smrg    tlen = la0 + lb0;
4645dfecf96Smrg    memset(t, '\0', sizeof(BNS) * tlen);
4655dfecf96Smrg    tlen = mp_mul(t, op1, op2, la0, lb0);
4665dfecf96Smrg
4675dfecf96Smrg    /* calculate u = a1 * b1 */
4685dfecf96Smrg    ulen = la1 + lb1;
4695dfecf96Smrg    memset(u, '\0', sizeof(BNS) * ulen);
4705dfecf96Smrg    ulen = mp_mul(u, op1 + x, op2 + x, la1, lb1);
4715dfecf96Smrg
4725dfecf96Smrg    /* subtract t from partial result */
4735dfecf96Smrg    xlen = mp_sub(r, r, t, xlen, tlen);
4745dfecf96Smrg
4755dfecf96Smrg    /* subtract u form partial result */
4765dfecf96Smrg    xlen = mp_sub(r, r, u, xlen, ulen);
4775dfecf96Smrg
4785dfecf96Smrg    /* add ux^2 to partial result */
4795dfecf96Smrg
4805dfecf96Smrg    r = rop + (x << 1);		/* multiplied by x^2 = 2^(BNSBITS*x*2) */
4815dfecf96Smrg    xlen = len1 + len2;
4825dfecf96Smrg    xlen = mp_add(r, r, u, xlen, ulen);
4835dfecf96Smrg
4845dfecf96Smrg    /* now add t to final result */
4855dfecf96Smrg    xlen = mp_add(rop, rop, t, xlen, tlen);
4865dfecf96Smrg
4875dfecf96Smrg    mp_free(t);
4885dfecf96Smrg    mp_free(u);
4895dfecf96Smrg
4905dfecf96Smrg    if (xlen > 1 && rop[xlen - 1] == 0)
4915dfecf96Smrg	--xlen;
4925dfecf96Smrg
4935dfecf96Smrg    return (xlen);
4945dfecf96Smrg}
4955dfecf96Smrg
4965dfecf96Smrg	/* Toom method	(partially based on GMP documentation)
4975dfecf96Smrg	 * Evaluation at k = [ 0 1/2 1 2 oo ]
4985dfecf96Smrg	 * U(x) = (U2k + U1)k + U0
4995dfecf96Smrg	 * V(x) = (V2k + V1)k + V0
5005dfecf96Smrg	 * W(x) = U(x)V(x)
5015dfecf96Smrg	 *
5025dfecf96Smrg	 * Sample:
5035dfecf96Smrg	 *	123 * 456
5045dfecf96Smrg	 *
5055dfecf96Smrg	 *	EVALUATION:
5065dfecf96Smrg	 * U(0) = (1*0+2)*0+3	=> 3
5075dfecf96Smrg	 * U(1) = 1+(2+3*2)*2	=> 17
5085dfecf96Smrg	 * U(2) = 1+2+3		=> 6
5095dfecf96Smrg	 * U(3) = (1*2+2)*2+3	=> 11
5105dfecf96Smrg	 * U(4)	= 1+(2+3*0)*0	=> 1
5115dfecf96Smrg	 *
5125dfecf96Smrg	 * V(0) = (4*0+5)*0+6	=> 6
5135dfecf96Smrg	 * V(1) = 4+(5+6*2)*2	=> 38
5145dfecf96Smrg	 * V(2) = 4+5+6		=> 15
5155dfecf96Smrg	 * V(3) = (4*2+5)*2+6	=> 32
5165dfecf96Smrg	 * V(4)	= 4+(5+6*0)*0	=> 4
5175dfecf96Smrg	 *
5185dfecf96Smrg	 *	U = [ 3   17  6  11 1 ]
5195dfecf96Smrg	 *	V = [ 6   38 15  32 4 ]
5205dfecf96Smrg	 *	W = [ 18 646 90 352 4 ]
5215dfecf96Smrg	 *
5225dfecf96Smrg	 * After that, we have:
5235dfecf96Smrg	 *	a = 18					(w0 already known)
5245dfecf96Smrg	 *	b = 16w0 + 8w1 + 4w2 + 2w3 +   w4
5255dfecf96Smrg	 *	c =   w0 +  w1 +  w2 +  w3 +   w4
5265dfecf96Smrg	 *	d =   w0 + 2w1 + 4w2 + 8w3 + 16w4
5275dfecf96Smrg	 *	e = 4					(w4 already known)
5285dfecf96Smrg	 *
5295dfecf96Smrg	 *	INTERPOLATION:
5305dfecf96Smrg	 *	b = b -16a - e		(354)
5315dfecf96Smrg	 *	c = c - a - e		(68)
5325dfecf96Smrg	 *	d = d - a - 16e		(270)
5335dfecf96Smrg	 *
5345dfecf96Smrg	 *	w = (b + d) - 8c = (10w1+8w2+10w3) - (8w1+8w2+8w3) = 2w1+2w3
5355dfecf96Smrg	 *	w = 2c - w		(56)
5365dfecf96Smrg	 *	b = b/2 = 4w1+w+w3
5375dfecf96Smrg	 *	b = b-c = 4w1+w+w3 - w1+w2+w3 = 3w1+w2
5385dfecf96Smrg	 *	c = w/2			(w2 = 28)
5395dfecf96Smrg	 *	b = b-c = 3w1+c - c = 3w1
5405dfecf96Smrg	 *	b = b/3			(w1 = 27)
5415dfecf96Smrg	 *	d = d/2
5425dfecf96Smrg	 *	d = d-b-w = b+w+4w3 - b-w = 4w3
5435dfecf96Smrg	 *	d = d/4			(w3 = 13)
5445dfecf96Smrg	 *
5455dfecf96Smrg	 *	RESULT:
5465dfecf96Smrg	 *	w4*10^4 + w3*10³ + w2*10² + w1*10 + w0
5475dfecf96Smrg	 *	40000   + 13000   + 2800   + 270    + 18
5485dfecf96Smrg	 *	10 is the base where the calculation was done
5495dfecf96Smrg	 *
5505dfecf96Smrg	 *	This sample uses small numbers, so it does not show the
5515dfecf96Smrg	 * advantage of the method. But for example (in base 10), when squaring
5525dfecf96Smrg	 *	123456789012345678901234567890
5535dfecf96Smrg	 *	The normal method would do 30*30=900 multiplications
5545dfecf96Smrg	 *	Karatsuba method would do 15*15*3=675 multiplications
5555dfecf96Smrg	 *	Toom method would do 10*10*5=500 multiplications
5565dfecf96Smrg	 * Toom method has a larger overhead if compared with Karatsuba method,
5575dfecf96Smrg	 * due to evaluation and interpolation, so it should be used for larger
5585dfecf96Smrg	 * numbers, so that the computation time of evaluation/interpolation
5595dfecf96Smrg	 * would be smaller than the time spent using other methods.
5605dfecf96Smrg	 *
5615dfecf96Smrg	 *	Note that Karatsuba method can be seen as a special case of
5625dfecf96Smrg	 * Toom method, i.e:
5635dfecf96Smrg	 *	U1U0 * V1V0
5645dfecf96Smrg	 *	with k = [ 0 1 oo ]
5655dfecf96Smrg	 *	U = [ U0 U1+U0 U1 ]
5665dfecf96Smrg	 *	V = [ V0 V1+V0 V1 ]
5675dfecf96Smrg	 *	W = [ U0*V0 (U1+U0)*(V1+V0) (U1+V1) ]
5685dfecf96Smrg	 *
5695dfecf96Smrg	 *	w0 = U0*V0
5705dfecf96Smrg	 *	w = (U1+U0)*(V1+V0)
5715dfecf96Smrg	 *	w2 = (U1*V1)
5725dfecf96Smrg	 *
5735dfecf96Smrg	 *	w1 = w - w0 - w2
5745dfecf96Smrg	 * w2x² + w1x + w0
5755dfecf96Smrg	 *
5765dfecf96Smrg	 *	See Knuth's Seminumerical Algorithms for a sample implemention
5775dfecf96Smrg	 * using 4 stacks and k = [ 0 1 2 3 ... ], based on the size of the
5785dfecf96Smrg	 * input.
5795dfecf96Smrg	 */
5805dfecf96Smrglong
5815dfecf96Smrgmp_toom_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
5825dfecf96Smrg{
5835dfecf96Smrg    long size, xsize, i;
5845dfecf96Smrg    BNI value;				/* used in division */
5855dfecf96Smrg    BNS carry;
5865dfecf96Smrg    BNI x;				/* shift count */
5875dfecf96Smrg    BNI l1, l2;
5885dfecf96Smrg    BNI al, bl, cl, dl, el, Ul[3], Vl[3];
5895dfecf96Smrg    BNS *a, *b, *c, *d, *e, *U[3], *V[3];
5905dfecf96Smrg
5915dfecf96Smrg    /* x is the base i.e. 2^(BNSBITS*x) */
5925dfecf96Smrg    x = (len1 + len2 + 4) / 6;
5935dfecf96Smrg    l1 = len1 - (x << 1);	/* length of remaining piece of op1 */
5945dfecf96Smrg    l2 = len2 - (x << 1);	/* length of remaining piece of op2 */
5955dfecf96Smrg
5965dfecf96Smrg    /* allocate memory for storing U and V */
5975dfecf96Smrg    U[0] = mp_malloc(sizeof(BNS) * (x + 2));
5985dfecf96Smrg    V[0] = mp_malloc(sizeof(BNS) * (x + 2));
5995dfecf96Smrg    U[1] = mp_malloc(sizeof(BNS) * (x + 1));
6005dfecf96Smrg    V[1] = mp_malloc(sizeof(BNS) * (x + 1));
6015dfecf96Smrg    U[2] = mp_malloc(sizeof(BNS) * (x + 2));
6025dfecf96Smrg    V[2] = mp_malloc(sizeof(BNS) * (x + 2));
6035dfecf96Smrg
6045dfecf96Smrg	/* EVALUATE U AND V */
6055dfecf96Smrg
6065dfecf96Smrg    /* Numbers are in the format U2x²+U1x+U0 and V2x²+V1x+V0 */
6075dfecf96Smrg
6085dfecf96Smrg	/* U[0] = U2+U1*2+U0*4 */
6095dfecf96Smrg
6105dfecf96Smrg    /* store U1*2 in U[1], this value is used twice */
6115dfecf96Smrg    Ul[1] = mp_lshift(U[1], op1 + x, x, 1);
6125dfecf96Smrg
6135dfecf96Smrg    /* store U0*4 in U[0] */
6145dfecf96Smrg    Ul[0] = mp_lshift(U[0], op1, x, 2);
6155dfecf96Smrg    /* add U1*2 to U[0] */
6165dfecf96Smrg    Ul[0] = mp_add(U[0], U[0], U[1], Ul[0], Ul[1]);
6175dfecf96Smrg    /* add U2 to U[0] */
6185dfecf96Smrg    Ul[0] = mp_add(U[0], U[0], op1 + x + x, Ul[0], l1);
6195dfecf96Smrg
6205dfecf96Smrg	/* U[2] = U2*4+U1*2+U0 */
6215dfecf96Smrg
6225dfecf96Smrg    /* store U2*4 in U[2] */
6235dfecf96Smrg    Ul[2] = mp_lshift(U[2], op1 + x + x, l1, 2);
6245dfecf96Smrg    /* add U1*2 to U[2] */
6255dfecf96Smrg    Ul[2] = mp_add(U[2], U[2], U[1], Ul[2], Ul[1]);
6265dfecf96Smrg    /* add U0 to U[2] */
6275dfecf96Smrg    Ul[2] = mp_add(U[2], U[2], op1, Ul[2], x);
6285dfecf96Smrg
6295dfecf96Smrg	/* U[1] = U2+U1+U0 */
6305dfecf96Smrg
6315dfecf96Smrg    Ul[1] = mp_add(U[1], op1, op1 + x, x, x);
6325dfecf96Smrg    Ul[1] = mp_add(U[1], U[1], op1 + x + x, Ul[1], l1);
6335dfecf96Smrg
6345dfecf96Smrg
6355dfecf96Smrg	/* Evaluate V[x], same code as U[x] */
6365dfecf96Smrg    Vl[1] = mp_lshift(V[1], op2 + x, x, 1);
6375dfecf96Smrg    Vl[0] = mp_lshift(V[0], op2, x, 2);
6385dfecf96Smrg    Vl[0] = mp_add(V[0], V[0], V[1], Vl[0], Vl[1]);
6395dfecf96Smrg    Vl[0] = mp_add(V[0], V[0], op2 + x + x, Vl[0], l2);
6405dfecf96Smrg    Vl[2] = mp_lshift(V[2], op2 + x + x, l2, 2);
6415dfecf96Smrg    Vl[2] = mp_add(V[2], V[2], V[1], Vl[2], Vl[1]);
6425dfecf96Smrg    Vl[2] = mp_add(V[2], V[2], op2, Vl[2], x);
6435dfecf96Smrg    Vl[1] = mp_add(V[1], op2, op2 + x, x, x);
6445dfecf96Smrg    Vl[1] = mp_add(V[1], V[1], op2 + x + x, Vl[1], l2);
6455dfecf96Smrg
6465dfecf96Smrg
6475dfecf96Smrg	/* MULTIPLY U[] AND V[] */
6485dfecf96Smrg
6495dfecf96Smrg	/* calculate (U2+U1*2+U0*4) * (V2+V1*2+V0*4) */
6505dfecf96Smrg    b = mp_calloc(1, sizeof(BNS) * (Ul[0] * Vl[0]));
6515dfecf96Smrg    bl = mp_mul(b, U[0], V[0], Ul[0], Vl[0]);
6525dfecf96Smrg    mp_free(U[0]);
6535dfecf96Smrg    mp_free(V[0]);
6545dfecf96Smrg
6555dfecf96Smrg	/* calculate (U2+U1+U0) * (V2+V1+V0) */
6565dfecf96Smrg    c = mp_calloc(1, sizeof(BNS) * (Ul[1] * Vl[1]));
6575dfecf96Smrg    cl = mp_mul(c, U[1], V[1], Ul[1], Vl[1]);
6585dfecf96Smrg    mp_free(U[1]);
6595dfecf96Smrg    mp_free(V[1]);
6605dfecf96Smrg
6615dfecf96Smrg	/* calculate (U2*4+U1*2+U0) * (V2*4+V1*2+V0) */
6625dfecf96Smrg    d = mp_calloc(1, sizeof(BNS) * (Ul[2] * Vl[2]));
6635dfecf96Smrg    dl = mp_mul(d, U[2], V[2], Ul[2], Vl[2]);
6645dfecf96Smrg    mp_free(U[2]);
6655dfecf96Smrg    mp_free(V[2]);
6665dfecf96Smrg
6675dfecf96Smrg	/* calculate U0 * V0 */
6685dfecf96Smrg    a = mp_calloc(1, sizeof(BNS) * (x + x));
6695dfecf96Smrg    al = mp_mul(a, op1, op2, x, x);
6705dfecf96Smrg
6715dfecf96Smrg	/* calculate U2 * V2 */
6725dfecf96Smrg    e = mp_calloc(1, sizeof(BNS) * (l1 + l2));
6735dfecf96Smrg    el = mp_mul(e, op1 + x + x, op2 + x + x, l1, l2);
6745dfecf96Smrg
6755dfecf96Smrg
6765dfecf96Smrg	/* INTERPOLATE COEFFICIENTS */
6775dfecf96Smrg
6785dfecf96Smrg    /* b = b - 16a - e */
6795dfecf96Smrg    size = mp_lshift(rop, a, al, 4);
6805dfecf96Smrg    bl = mp_sub(b, b, rop, bl, size);
6815dfecf96Smrg    bl = mp_sub(b, b, e, bl, el);
6825dfecf96Smrg
6835dfecf96Smrg    /* c = c - a - e*/
6845dfecf96Smrg    cl = mp_sub(c, c, a, cl, al);
6855dfecf96Smrg    cl = mp_sub(c, c, e, cl, el);
6865dfecf96Smrg
6875dfecf96Smrg    /* d = d - a - 16e */
6885dfecf96Smrg    dl = mp_sub(d, d, a, dl, al);
6895dfecf96Smrg    size = mp_lshift(rop, e, el, 4);
6905dfecf96Smrg    dl = mp_sub(d, d, rop, dl, size);
6915dfecf96Smrg
6925dfecf96Smrg    /* w = (b + d) - 8c */
6935dfecf96Smrg    size = mp_add(rop, b, d, bl, dl);
6945dfecf96Smrg    xsize = mp_lshift(rop + size, c, cl, 3);	/* rop has enough storage */
6955dfecf96Smrg    size = mp_sub(rop, rop, rop + size, size, xsize);
6965dfecf96Smrg
6975dfecf96Smrg    /* w = 2c - w*/
6985dfecf96Smrg    xsize = mp_lshift(rop + size, c, cl, 1);
6995dfecf96Smrg    size = mp_sub(rop, rop + size, rop, xsize, size);
7005dfecf96Smrg
7015dfecf96Smrg    /* b = b/2 */
7025dfecf96Smrg    bl = mp_rshift(b, b, bl, 1);
7035dfecf96Smrg
7045dfecf96Smrg    /* b = b - c */
7055dfecf96Smrg    bl = mp_sub(b, b, c, bl, cl);
7065dfecf96Smrg
7075dfecf96Smrg    /* c = w / 2 */
7085dfecf96Smrg    cl = mp_rshift(c, rop, size, 1);
7095dfecf96Smrg
7105dfecf96Smrg    /* b = b - c */
7115dfecf96Smrg    bl = mp_sub(b, b, c, bl, cl);
7125dfecf96Smrg
7135dfecf96Smrg    /* b = b/3 */
7145dfecf96Smrg	/* maybe the most expensive calculation */
7155dfecf96Smrg    i = bl - 1;
7165dfecf96Smrg    value = b[i];
7175dfecf96Smrg    b[i] = value / 3;
7185dfecf96Smrg    for (--i; i >= 0; i--) {
7195dfecf96Smrg	carry = value % 3;
7205dfecf96Smrg	value = ((BNI)carry << BNSBITS) + b[i];
7215dfecf96Smrg	b[i] = (BNS)(value / 3);
7225dfecf96Smrg    }
7235dfecf96Smrg
7245dfecf96Smrg    /* d = d/2 */
7255dfecf96Smrg    dl = mp_rshift(d, d, dl, 1);
7265dfecf96Smrg
7275dfecf96Smrg    /* d = d - b - w */
7285dfecf96Smrg    dl = mp_sub(d, d, b, dl, bl);
7295dfecf96Smrg    dl = mp_sub(d, d, rop, dl, size);
7305dfecf96Smrg
7315dfecf96Smrg    /* d = d/4 */
7325dfecf96Smrg    dl = mp_rshift(d, d, dl, 2);
7335dfecf96Smrg
7345dfecf96Smrg
7355dfecf96Smrg	/* STORE RESULT IN ROP */
7365dfecf96Smrg    /* first clear memory used as temporary variable w and 8c */
7375dfecf96Smrg    memset(rop, '\0', sizeof(BNS) * (len1 + len2));
7385dfecf96Smrg
7395dfecf96Smrg    i = x * 4;
7405dfecf96Smrg    xsize = (len1 + len2) - i;
7415dfecf96Smrg    size = mp_add(rop + i, rop + i, e, xsize, el) + i;
7425dfecf96Smrg    i = x * 3;
7435dfecf96Smrg    xsize = size - i;
7445dfecf96Smrg    size = mp_add(rop + i, rop + i, d, xsize, dl) + i;
7455dfecf96Smrg    i = x * 2;
7465dfecf96Smrg    xsize = size - i;
7475dfecf96Smrg    size = mp_add(rop + i, rop + i, c, xsize, cl) + i;
7485dfecf96Smrg    i = x;
7495dfecf96Smrg    xsize = size - i;
7505dfecf96Smrg    size = mp_add(rop + i, rop + i, b, xsize, bl) + i;
7515dfecf96Smrg    size = mp_add(rop, rop, a, size, al);
7525dfecf96Smrg
7535dfecf96Smrg    mp_free(e);
7545dfecf96Smrg    mp_free(d);
7555dfecf96Smrg    mp_free(c);
7565dfecf96Smrg    mp_free(b);
7575dfecf96Smrg    mp_free(a);
7585dfecf96Smrg
7595dfecf96Smrg    if (size > 1 && rop[size - 1] == 0)
7605dfecf96Smrg	--size;
7615dfecf96Smrg
7625dfecf96Smrg    return (size);
7635dfecf96Smrg}
7645dfecf96Smrg
7655dfecf96Smrglong
7665dfecf96Smrgmp_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
7675dfecf96Smrg{
7685dfecf96Smrg    if (len1 < len2)
7695dfecf96Smrg	MP_SWAP(op1, op2, len1, len2);
7705dfecf96Smrg
7715dfecf96Smrg    if (len1 < KARATSUBA || len2 < KARATSUBA)
7725dfecf96Smrg	return (mp_base_mul(rop, op1, op2, len1, len2));
7735dfecf96Smrg    else if (len1 < TOOM && len2 < TOOM && len2 > ((len1 + 1) >> 1))
7745dfecf96Smrg	return (mp_karatsuba_mul(rop, op1, op2, len1, len2));
7755dfecf96Smrg    else if (len1 >= TOOM && len2 >= TOOM && (len2 + 2) / 3 == (len1 + 2) / 3)
7765dfecf96Smrg	return (mp_toom_mul(rop, op1, op2, len1, len2));
7775dfecf96Smrg    else {
7785dfecf96Smrg	long xsize, psize, isize;
7795dfecf96Smrg	BNS *ptr;
7805dfecf96Smrg
7815dfecf96Smrg	/* adjust index pointer and estimated size of result */
7825dfecf96Smrg	isize = 0;
7835dfecf96Smrg	xsize = len1 + len2;
7845dfecf96Smrg	mp_mul(rop, op1, op2, len2, len2);
7855dfecf96Smrg	/* adjust pointers */
7865dfecf96Smrg	len1 -= len2;
7875dfecf96Smrg	op1 += len2;
7885dfecf96Smrg
7895dfecf96Smrg	/* allocate buffer for intermediate multiplications */
7905dfecf96Smrg	if (len1 > len2)
7915dfecf96Smrg	    ptr = mp_calloc(1, sizeof(BNS) * (len2 + len2));
7925dfecf96Smrg	else
7935dfecf96Smrg	    ptr = mp_calloc(1, sizeof(BNS) * (len1 + len2));
7945dfecf96Smrg
7955dfecf96Smrg	/* loop multiplying len2 size operands at a time */
7965dfecf96Smrg	while (len1 >= len2) {
7975dfecf96Smrg	    isize += len2;
7985dfecf96Smrg	    psize = mp_mul(ptr, op1, op2, len2, len2);
7995dfecf96Smrg	    mp_add(rop + isize, rop + isize, ptr, xsize - isize, psize);
8005dfecf96Smrg	    len1 -= len2;
8015dfecf96Smrg	    op1 += len2;
8025dfecf96Smrg
8035dfecf96Smrg	    /* multiplication routines require zeroed memory */
8045dfecf96Smrg	    memset(ptr, '\0', sizeof(BNS) * (MIN(len1, len2) + len2));
8055dfecf96Smrg	}
8065dfecf96Smrg
8075dfecf96Smrg	/* len1 was not a multiple of len2 */
8085dfecf96Smrg	if (len1) {
8095dfecf96Smrg	    isize += len2;
8105dfecf96Smrg	    psize = mp_mul(ptr, op2, op1, len2, len1);
8115dfecf96Smrg	    mp_add(rop + isize, rop + isize, ptr, xsize, psize);
8125dfecf96Smrg	}
8135dfecf96Smrg
8145dfecf96Smrg	/* adjust result size */
8155dfecf96Smrg	if (rop[xsize - 1] == 0)
8165dfecf96Smrg	    --xsize;
8175dfecf96Smrg
8185dfecf96Smrg	mp_free(ptr);
8195dfecf96Smrg
8205dfecf96Smrg	return (xsize);
8215dfecf96Smrg    }
8225dfecf96Smrg}
823