Home | History | Annotate | Line # | Download | only in mp
      1 /*
      2  * Copyright (c) 2002 by The XFree86 Project, Inc.
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice shall be included in
     12  * all copies or substantial portions of the Software.
     13  *
     14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     17  * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
     18  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
     19  * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     20  * SOFTWARE.
     21  *
     22  * Except as contained in this notice, the name of the XFree86 Project shall
     23  * not be used in advertising or otherwise to promote the sale, use or other
     24  * dealings in this Software without prior written authorization from the
     25  * XFree86 Project.
     26  *
     27  * Author: Paulo Csar Pereira de Andrade
     28  */
     29 
     30 /* $XFree86: xc/programs/xedit/lisp/mp/mp.c,v 1.2 2002/11/08 08:01:00 paulo Exp $ */
     31 
     32 #include "mp.h"
     33 
     34 /*
     35  * TODO:
     36  *	o Optimize squaring
     37  *	o Write better division code and move from mpi.c to here
     38  *	o Make multiplication code don't required memory to be zeroed
     39  *		+ The first step is easy, just multiply the low word,
     40  *		then the high word, that may overlap with the result
     41  *		of the first multiply (in case of carry), and then
     42  *		just make sure carry is properly propagated in the
     43  *		subsequent multiplications.
     44  *		+ Some code needs also to be rewritten because some
     45  *		intermediate addition code in mp_mul, mp_karatsuba_mul,
     46  *		and mp_toom_mul is assuming the memory is zeroed.
     47  */
     48 
     49 /*
     50  * Prototypes
     51  */
     52 	/* out of memory handler */
     53 static void mp_outmem(void);
     54 
     55 	/* memory allocation fallback functions */
     56 static void *_mp_malloc(size_t);
     57 static void *_mp_calloc(size_t, size_t);
     58 static void *_mp_realloc(void*, size_t);
     59 static void _mp_free(void*);
     60 
     61 /*
     62  * Initialization
     63  */
     64 static mp_malloc_fun __mp_malloc = _mp_malloc;
     65 static mp_calloc_fun __mp_calloc = _mp_calloc;
     66 static mp_realloc_fun __mp_realloc = _mp_realloc;
     67 static mp_free_fun __mp_free = _mp_free;
     68 
     69 /*
     70  * Implementation
     71  */
     72 static void
     73 mp_outmem(void)
     74 {
     75     fprintf(stderr, "out of memory in MP library.\n");
     76     exit(1);
     77 }
     78 
     79 static void *
     80 _mp_malloc(size_t size)
     81 {
     82     return (malloc(size));
     83 }
     84 
     85 void *
     86 mp_malloc(size_t size)
     87 {
     88     void *pointer = (*__mp_malloc)(size);
     89 
     90     if (pointer == NULL)
     91 	mp_outmem();
     92 
     93     return (pointer);
     94 }
     95 
     96 mp_malloc_fun
     97 mp_set_malloc(mp_malloc_fun fun)
     98 {
     99     mp_malloc_fun old = __mp_malloc;
    100 
    101     __mp_malloc = fun;
    102 
    103     return (old);
    104 }
    105 
    106 static void *
    107 _mp_calloc(size_t nmemb, size_t size)
    108 {
    109     return (calloc(nmemb, size));
    110 }
    111 
    112 void *
    113 mp_calloc(size_t nmemb, size_t size)
    114 {
    115     void *pointer = (*__mp_calloc)(nmemb, size);
    116 
    117     if (pointer == NULL)
    118 	mp_outmem();
    119 
    120     return (pointer);
    121 }
    122 
    123 mp_calloc_fun
    124 mp_set_calloc(mp_calloc_fun fun)
    125 {
    126     mp_calloc_fun old = __mp_calloc;
    127 
    128     __mp_calloc = fun;
    129 
    130     return (old);
    131 }
    132 
    133 static void *
    134 _mp_realloc(void *old, size_t size)
    135 {
    136     return (realloc(old, size));
    137 }
    138 
    139 void *
    140 mp_realloc(void *old, size_t size)
    141 {
    142     void *pointer = (*__mp_realloc)(old, size);
    143 
    144     if (pointer == NULL)
    145 	mp_outmem();
    146 
    147     return (pointer);
    148 }
    149 
    150 mp_realloc_fun
    151 mp_set_realloc(mp_realloc_fun fun)
    152 {
    153     mp_realloc_fun old = __mp_realloc;
    154 
    155     __mp_realloc = fun;
    156 
    157     return (old);
    158 }
    159 
    160 static void
    161 _mp_free(void *pointer)
    162 {
    163     free(pointer);
    164 }
    165 
    166 void
    167 mp_free(void *pointer)
    168 {
    169     (*__mp_free)(pointer);
    170 }
    171 
    172 mp_free_fun
    173 mp_set_free(mp_free_fun fun)
    174 {
    175     mp_free_fun old = __mp_free;
    176 
    177     __mp_free = fun;
    178 
    179     return (old);
    180 }
    181 
    182 long
    183 mp_add(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
    184 {
    185     BNI value;			/* intermediate result */
    186     BNS carry;			/* carry flag */
    187     long size;			/* result size */
    188 
    189     if (len1 < len2)
    190 	MP_SWAP(op1, op2, len1, len2);
    191 
    192     /* unroll start of loop */
    193     value = (BNI)op1[0] + op2[0];
    194     rop[0] = value;
    195     carry = value >> BNSBITS;
    196 
    197     /* add op1 and op2 */
    198     for (size = 1; size < len2; size++) {
    199 	value = (BNI)op1[size] + op2[size] + carry;
    200 	rop[size] = value;
    201 	carry = value >> BNSBITS;
    202     }
    203     if (rop != op1) {
    204 	for (; size < len1; size++) {
    205 	    value = (BNI)op1[size] + carry;
    206 	    rop[size] = value;
    207 	    carry = value >> BNSBITS;
    208 	}
    209     }
    210     else {
    211 	/* if rop == op1, than just adjust carry */
    212 	for (; carry && size < len1; size++) {
    213 	    value = (BNI)op1[size] + carry;
    214 	    rop[size] = value;
    215 	    carry = value >> BNSBITS;
    216 	}
    217 	size = len1;
    218     }
    219     if (carry)
    220 	rop[size++] = carry;
    221 
    222     return (size);
    223 }
    224 
    225 long
    226 mp_sub(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
    227 {
    228     long svalue;		/* intermediate result */
    229     BNS carry;			/* carry flag */
    230     long size;			/* result size */
    231 
    232     /* special case */
    233     if (op1 == op2) {
    234 	rop[0] = 0;
    235 
    236 	return (1);
    237     }
    238 
    239     /* unroll start of loop */
    240     svalue = (long)op1[0] - op2[0];
    241     rop[0] = svalue;
    242     carry = svalue < 0;
    243 
    244     /* subtracts op2 from op1 */
    245     for (size = 1; size < len2; size++) {
    246 	svalue = (long)(op1[size]) - op2[size] - carry;
    247 	rop[size] = svalue;
    248 	carry = svalue < 0;
    249     }
    250     if (rop != op1) {
    251 	for (; size < len1; size++) {
    252 	    svalue = op1[size] - carry;
    253 	    rop[size] = svalue;
    254 	    carry = svalue < 0;
    255 	}
    256     }
    257     else {
    258 	/* if rop == op1, than just adjust carry */
    259 	for (; carry && size < len1; size++) {
    260 	    svalue = (long)op1[size] - carry;
    261 	    rop[size] = svalue;
    262 	    carry = svalue < 0;
    263 	}
    264 	size = len1;
    265     }
    266 
    267     /* calculate result size */
    268     while (size > 1 && rop[size - 1] == 0)
    269 	--size;
    270 
    271     return (size);
    272 }
    273 
    274 long
    275 mp_lshift(BNS *rop, BNS *op, BNI len, long shift)
    276 {
    277     long i, size;
    278     BNI words, bits;		/* how many word and bit shifts */
    279 
    280     words = shift / BNSBITS;
    281     bits = shift % BNSBITS;
    282     size = len + words;
    283 
    284     if (bits) {
    285 	BNS hi, lo;
    286 	BNI carry;
    287 	int adj;
    288 
    289 	for (i = 1, carry = CARRY >> 1; carry; i++, carry >>= 1)
    290 	    if (op[len - 1] & carry)
    291 		break;
    292 	adj = (bits + (BNSBITS - i)) / BNSBITS;
    293 	size += adj;
    294 
    295 	lo = hi = op[0];
    296 	rop[words] = lo << bits;
    297 	for (i = 1; i < len; i++) {
    298 	    hi = op[i];
    299 	    rop[words + i] = hi << bits | (lo >> (BNSBITS - bits));
    300 	    lo = hi;
    301 	}
    302 	if (adj)
    303 	    rop[size - 1] = hi >> (BNSBITS - bits);
    304     }
    305     else
    306 	memmove(rop + size - len, op, sizeof(BNS) * len);
    307 
    308     if (words)
    309 	memset(rop, '\0', sizeof(BNS) * words);
    310 
    311     return (size);
    312 }
    313 
    314 long
    315 mp_rshift(BNS *rop, BNS *op, BNI len, long shift)
    316 {
    317     int adj = 0;
    318     long i, size;
    319     BNI words, bits;		/* how many word and bit shifts */
    320 
    321     words = shift / BNSBITS;
    322     bits = shift % BNSBITS;
    323     size = len - words;
    324 
    325     if (bits) {
    326 	BNS hi, lo;
    327 	BNI carry;
    328 
    329 	for (i = 0, carry = CARRY >> 1; carry; i++, carry >>= 1)
    330 	    if (op[len - 1] & carry)
    331 		break;
    332 	adj = (bits + i) / BNSBITS;
    333 	if (size - adj == 0) {
    334 	    rop[0] = 0;
    335 
    336 	    return (1);
    337 	}
    338 
    339 	hi = lo = op[words + size - 1];
    340 	rop[size - 1] = hi >> bits;
    341 	for (i = size - 2; i >= 0; i--) {
    342 	    lo = op[words + i];
    343 	    rop[i] = (lo >> bits) | (hi << (BNSBITS - bits));
    344 	    hi = lo;
    345 	}
    346 	if (adj)
    347 	    rop[0] |= lo << (BNSBITS - bits);
    348     }
    349     else
    350 	memmove(rop, op + len - size, size * sizeof(BNS));
    351 
    352     return (size - adj);
    353 }
    354 
    355 	/* rop must be a pointer to len1 + len2 elements
    356 	 * rop cannot be either op1 or op2
    357 	 * rop must be all zeros */
    358 long
    359 mp_base_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
    360 {
    361     long i, j;			/* counters */
    362     BNI value;			/* intermediate result */
    363     BNS carry;			/* carry value */
    364     long size = len1 + len2;
    365 
    366     /* simple optimization: first pass does not need to deference rop[i+j] */
    367     if (op1[0]) {
    368 	value = (BNI)(op1[0]) * op2[0];
    369 	rop[0] = value;
    370 	carry = (BNS)(value >> BNSBITS);
    371 	for (j = 1; j < len2; j++) {
    372 	    value = (BNI)(op1[0]) * op2[j] + carry;
    373 	    rop[j] = value;
    374 	    carry = (BNS)(value >> BNSBITS);
    375 	}
    376 	rop[j] = carry;
    377     }
    378 
    379     /* do the multiplication */
    380     for (i = 1; i < len1; i++) {
    381 	if (op1[i]) {
    382 	    /* unrool loop initialization */
    383 	    value = (BNI)(op1[i]) * op2[0] + rop[i];
    384 	    rop[i] = value;
    385 	    carry = (BNS)(value >> BNSBITS);
    386 	    /* multiply */
    387 	    for (j = 1; j < len2; j++) {
    388 		value = (BNI)(op1[i]) * op2[j] + rop[i + j] + carry;
    389 		rop[i + j] = value;
    390 		carry = (BNS)(value >> BNSBITS);
    391 	    }
    392 	    rop[i + j] = carry;
    393 	}
    394     }
    395 
    396     if (size > 1 && rop[size - 1] == 0)
    397 	--size;
    398 
    399     return (size);
    400 }
    401 
    402 	/* Karatsuba method
    403 	 * t + ((a0 + a1) (b0 + b1) - t - u) x + ux
    404 	 * where t = a0b0 and u = a1b1
    405 	 *
    406 	 * Karatsuba method reduces the number of multiplications. Example:
    407 	 *	Square a 40 length number
    408 	 *	instead of a plain 40*40 = 1600 multiplies/adds, it does:
    409 	 *	20*20+20*20+20*20 = 1200
    410 	 *	but since it is recursive, every 20*20=400 is reduced to
    411 	 *	10*10+10*10+10*10=300
    412 	 *	and so on.
    413 	 * The multiplication by x and x is a just a shift, as it is a
    414 	 * power of two, and is implemented below by just writting at the
    415 	 * correct offset */
    416 long
    417 mp_karatsuba_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
    418 {
    419     BNI x;				/* shift count */
    420     BNI la0, la1, lb0, lb1;		/* length of a0, a1, b0, and b1 */
    421     BNS *t;				/* temporary memory for t product */
    422     BNS *u;				/* temporary memory for u product */
    423     BNS *r;				/* pointer to rop */
    424     long xlen, tlen, ulen;
    425 
    426     /* calculate value of x, that is 2^(BNSBITS*x) */
    427     if (len1 >= len2)
    428 	x = (len1 + 1) >> 1;
    429     else
    430 	x = (len2 + 1) >> 1;
    431 
    432     /* calculate length of operands */
    433     la0 = x;
    434     la1 = len1 - x;
    435     lb0 = x;
    436     lb1 = len2 - x;
    437 
    438     /* allocate buffer for t and (a0 + a1) */
    439     tlen = la0 + lb0;
    440     t = mp_malloc(sizeof(BNS) * tlen);
    441 
    442     /* allocate buffer for u and (b0 + b1) */
    443     if (la1 + lb1 < lb0 + lb1 + 1)
    444 	ulen = lb0 + lb1 + 1;
    445     else
    446 	ulen = la1 + lb1;
    447     u = mp_malloc(sizeof(BNS) * ulen);
    448 
    449     /* calculate a0 + a1, store result in t */
    450     tlen = mp_add(t, op1, op1 + x, la0, la1);
    451 
    452     /* calculate b0 + b1, store result in u */
    453     ulen = mp_add(u, op2, op2 + x, lb0, lb1);
    454 
    455     /* store (a0 + a1) * (b0 + b1) in rop */
    456 
    457     r = rop + x;		/* multiplied by 2^(BNSBITS*x) */
    458     xlen = mp_mul(r, t, u, tlen, ulen);
    459 
    460     /* must zero t and u memory, this is required for mp_mul */
    461 
    462     /* calculate t = a0 * b0 */
    463     tlen = la0 + lb0;
    464     memset(t, '\0', sizeof(BNS) * tlen);
    465     tlen = mp_mul(t, op1, op2, la0, lb0);
    466 
    467     /* calculate u = a1 * b1 */
    468     ulen = la1 + lb1;
    469     memset(u, '\0', sizeof(BNS) * ulen);
    470     ulen = mp_mul(u, op1 + x, op2 + x, la1, lb1);
    471 
    472     /* subtract t from partial result */
    473     xlen = mp_sub(r, r, t, xlen, tlen);
    474 
    475     /* subtract u form partial result */
    476     xlen = mp_sub(r, r, u, xlen, ulen);
    477 
    478     /* add ux^2 to partial result */
    479 
    480     r = rop + (x << 1);		/* multiplied by x^2 = 2^(BNSBITS*x*2) */
    481     xlen = len1 + len2;
    482     xlen = mp_add(r, r, u, xlen, ulen);
    483 
    484     /* now add t to final result */
    485     xlen = mp_add(rop, rop, t, xlen, tlen);
    486 
    487     mp_free(t);
    488     mp_free(u);
    489 
    490     if (xlen > 1 && rop[xlen - 1] == 0)
    491 	--xlen;
    492 
    493     return (xlen);
    494 }
    495 
    496 	/* Toom method	(partially based on GMP documentation)
    497 	 * Evaluation at k = [ 0 1/2 1 2 oo ]
    498 	 * U(x) = (U2k + U1)k + U0
    499 	 * V(x) = (V2k + V1)k + V0
    500 	 * W(x) = U(x)V(x)
    501 	 *
    502 	 * Sample:
    503 	 *	123 * 456
    504 	 *
    505 	 *	EVALUATION:
    506 	 * U(0) = (1*0+2)*0+3	=> 3
    507 	 * U(1) = 1+(2+3*2)*2	=> 17
    508 	 * U(2) = 1+2+3		=> 6
    509 	 * U(3) = (1*2+2)*2+3	=> 11
    510 	 * U(4)	= 1+(2+3*0)*0	=> 1
    511 	 *
    512 	 * V(0) = (4*0+5)*0+6	=> 6
    513 	 * V(1) = 4+(5+6*2)*2	=> 38
    514 	 * V(2) = 4+5+6		=> 15
    515 	 * V(3) = (4*2+5)*2+6	=> 32
    516 	 * V(4)	= 4+(5+6*0)*0	=> 4
    517 	 *
    518 	 *	U = [ 3   17  6  11 1 ]
    519 	 *	V = [ 6   38 15  32 4 ]
    520 	 *	W = [ 18 646 90 352 4 ]
    521 	 *
    522 	 * After that, we have:
    523 	 *	a = 18					(w0 already known)
    524 	 *	b = 16w0 + 8w1 + 4w2 + 2w3 +   w4
    525 	 *	c =   w0 +  w1 +  w2 +  w3 +   w4
    526 	 *	d =   w0 + 2w1 + 4w2 + 8w3 + 16w4
    527 	 *	e = 4					(w4 already known)
    528 	 *
    529 	 *	INTERPOLATION:
    530 	 *	b = b -16a - e		(354)
    531 	 *	c = c - a - e		(68)
    532 	 *	d = d - a - 16e		(270)
    533 	 *
    534 	 *	w = (b + d) - 8c = (10w1+8w2+10w3) - (8w1+8w2+8w3) = 2w1+2w3
    535 	 *	w = 2c - w		(56)
    536 	 *	b = b/2 = 4w1+w+w3
    537 	 *	b = b-c = 4w1+w+w3 - w1+w2+w3 = 3w1+w2
    538 	 *	c = w/2			(w2 = 28)
    539 	 *	b = b-c = 3w1+c - c = 3w1
    540 	 *	b = b/3			(w1 = 27)
    541 	 *	d = d/2
    542 	 *	d = d-b-w = b+w+4w3 - b-w = 4w3
    543 	 *	d = d/4			(w3 = 13)
    544 	 *
    545 	 *	RESULT:
    546 	 *	w4*10^4 + w3*10 + w2*10 + w1*10 + w0
    547 	 *	40000   + 13000   + 2800   + 270    + 18
    548 	 *	10 is the base where the calculation was done
    549 	 *
    550 	 *	This sample uses small numbers, so it does not show the
    551 	 * advantage of the method. But for example (in base 10), when squaring
    552 	 *	123456789012345678901234567890
    553 	 *	The normal method would do 30*30=900 multiplications
    554 	 *	Karatsuba method would do 15*15*3=675 multiplications
    555 	 *	Toom method would do 10*10*5=500 multiplications
    556 	 * Toom method has a larger overhead if compared with Karatsuba method,
    557 	 * due to evaluation and interpolation, so it should be used for larger
    558 	 * numbers, so that the computation time of evaluation/interpolation
    559 	 * would be smaller than the time spent using other methods.
    560 	 *
    561 	 *	Note that Karatsuba method can be seen as a special case of
    562 	 * Toom method, i.e:
    563 	 *	U1U0 * V1V0
    564 	 *	with k = [ 0 1 oo ]
    565 	 *	U = [ U0 U1+U0 U1 ]
    566 	 *	V = [ V0 V1+V0 V1 ]
    567 	 *	W = [ U0*V0 (U1+U0)*(V1+V0) (U1+V1) ]
    568 	 *
    569 	 *	w0 = U0*V0
    570 	 *	w = (U1+U0)*(V1+V0)
    571 	 *	w2 = (U1*V1)
    572 	 *
    573 	 *	w1 = w - w0 - w2
    574 	 * w2x + w1x + w0
    575 	 *
    576 	 *	See Knuth's Seminumerical Algorithms for a sample implemention
    577 	 * using 4 stacks and k = [ 0 1 2 3 ... ], based on the size of the
    578 	 * input.
    579 	 */
    580 long
    581 mp_toom_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
    582 {
    583     long size, xsize, i;
    584     BNI value;				/* used in division */
    585     BNS carry;
    586     BNI x;				/* shift count */
    587     BNI l1, l2;
    588     BNI al, bl, cl, dl, el, Ul[3], Vl[3];
    589     BNS *a, *b, *c, *d, *e, *U[3], *V[3];
    590 
    591     /* x is the base i.e. 2^(BNSBITS*x) */
    592     x = (len1 + len2 + 4) / 6;
    593     l1 = len1 - (x << 1);	/* length of remaining piece of op1 */
    594     l2 = len2 - (x << 1);	/* length of remaining piece of op2 */
    595 
    596     /* allocate memory for storing U and V */
    597     U[0] = mp_malloc(sizeof(BNS) * (x + 2));
    598     V[0] = mp_malloc(sizeof(BNS) * (x + 2));
    599     U[1] = mp_malloc(sizeof(BNS) * (x + 1));
    600     V[1] = mp_malloc(sizeof(BNS) * (x + 1));
    601     U[2] = mp_malloc(sizeof(BNS) * (x + 2));
    602     V[2] = mp_malloc(sizeof(BNS) * (x + 2));
    603 
    604 	/* EVALUATE U AND V */
    605 
    606     /* Numbers are in the format U2x+U1x+U0 and V2x+V1x+V0 */
    607 
    608 	/* U[0] = U2+U1*2+U0*4 */
    609 
    610     /* store U1*2 in U[1], this value is used twice */
    611     Ul[1] = mp_lshift(U[1], op1 + x, x, 1);
    612 
    613     /* store U0*4 in U[0] */
    614     Ul[0] = mp_lshift(U[0], op1, x, 2);
    615     /* add U1*2 to U[0] */
    616     Ul[0] = mp_add(U[0], U[0], U[1], Ul[0], Ul[1]);
    617     /* add U2 to U[0] */
    618     Ul[0] = mp_add(U[0], U[0], op1 + x + x, Ul[0], l1);
    619 
    620 	/* U[2] = U2*4+U1*2+U0 */
    621 
    622     /* store U2*4 in U[2] */
    623     Ul[2] = mp_lshift(U[2], op1 + x + x, l1, 2);
    624     /* add U1*2 to U[2] */
    625     Ul[2] = mp_add(U[2], U[2], U[1], Ul[2], Ul[1]);
    626     /* add U0 to U[2] */
    627     Ul[2] = mp_add(U[2], U[2], op1, Ul[2], x);
    628 
    629 	/* U[1] = U2+U1+U0 */
    630 
    631     Ul[1] = mp_add(U[1], op1, op1 + x, x, x);
    632     Ul[1] = mp_add(U[1], U[1], op1 + x + x, Ul[1], l1);
    633 
    634 
    635 	/* Evaluate V[x], same code as U[x] */
    636     Vl[1] = mp_lshift(V[1], op2 + x, x, 1);
    637     Vl[0] = mp_lshift(V[0], op2, x, 2);
    638     Vl[0] = mp_add(V[0], V[0], V[1], Vl[0], Vl[1]);
    639     Vl[0] = mp_add(V[0], V[0], op2 + x + x, Vl[0], l2);
    640     Vl[2] = mp_lshift(V[2], op2 + x + x, l2, 2);
    641     Vl[2] = mp_add(V[2], V[2], V[1], Vl[2], Vl[1]);
    642     Vl[2] = mp_add(V[2], V[2], op2, Vl[2], x);
    643     Vl[1] = mp_add(V[1], op2, op2 + x, x, x);
    644     Vl[1] = mp_add(V[1], V[1], op2 + x + x, Vl[1], l2);
    645 
    646 
    647 	/* MULTIPLY U[] AND V[] */
    648 
    649 	/* calculate (U2+U1*2+U0*4) * (V2+V1*2+V0*4) */
    650     b = mp_calloc(1, sizeof(BNS) * (Ul[0] * Vl[0]));
    651     bl = mp_mul(b, U[0], V[0], Ul[0], Vl[0]);
    652     mp_free(U[0]);
    653     mp_free(V[0]);
    654 
    655 	/* calculate (U2+U1+U0) * (V2+V1+V0) */
    656     c = mp_calloc(1, sizeof(BNS) * (Ul[1] * Vl[1]));
    657     cl = mp_mul(c, U[1], V[1], Ul[1], Vl[1]);
    658     mp_free(U[1]);
    659     mp_free(V[1]);
    660 
    661 	/* calculate (U2*4+U1*2+U0) * (V2*4+V1*2+V0) */
    662     d = mp_calloc(1, sizeof(BNS) * (Ul[2] * Vl[2]));
    663     dl = mp_mul(d, U[2], V[2], Ul[2], Vl[2]);
    664     mp_free(U[2]);
    665     mp_free(V[2]);
    666 
    667 	/* calculate U0 * V0 */
    668     a = mp_calloc(1, sizeof(BNS) * (x + x));
    669     al = mp_mul(a, op1, op2, x, x);
    670 
    671 	/* calculate U2 * V2 */
    672     e = mp_calloc(1, sizeof(BNS) * (l1 + l2));
    673     el = mp_mul(e, op1 + x + x, op2 + x + x, l1, l2);
    674 
    675 
    676 	/* INTERPOLATE COEFFICIENTS */
    677 
    678     /* b = b - 16a - e */
    679     size = mp_lshift(rop, a, al, 4);
    680     bl = mp_sub(b, b, rop, bl, size);
    681     bl = mp_sub(b, b, e, bl, el);
    682 
    683     /* c = c - a - e*/
    684     cl = mp_sub(c, c, a, cl, al);
    685     cl = mp_sub(c, c, e, cl, el);
    686 
    687     /* d = d - a - 16e */
    688     dl = mp_sub(d, d, a, dl, al);
    689     size = mp_lshift(rop, e, el, 4);
    690     dl = mp_sub(d, d, rop, dl, size);
    691 
    692     /* w = (b + d) - 8c */
    693     size = mp_add(rop, b, d, bl, dl);
    694     xsize = mp_lshift(rop + size, c, cl, 3);	/* rop has enough storage */
    695     size = mp_sub(rop, rop, rop + size, size, xsize);
    696 
    697     /* w = 2c - w*/
    698     xsize = mp_lshift(rop + size, c, cl, 1);
    699     size = mp_sub(rop, rop + size, rop, xsize, size);
    700 
    701     /* b = b/2 */
    702     bl = mp_rshift(b, b, bl, 1);
    703 
    704     /* b = b - c */
    705     bl = mp_sub(b, b, c, bl, cl);
    706 
    707     /* c = w / 2 */
    708     cl = mp_rshift(c, rop, size, 1);
    709 
    710     /* b = b - c */
    711     bl = mp_sub(b, b, c, bl, cl);
    712 
    713     /* b = b/3 */
    714 	/* maybe the most expensive calculation */
    715     i = bl - 1;
    716     value = b[i];
    717     b[i] = value / 3;
    718     for (--i; i >= 0; i--) {
    719 	carry = value % 3;
    720 	value = ((BNI)carry << BNSBITS) + b[i];
    721 	b[i] = (BNS)(value / 3);
    722     }
    723 
    724     /* d = d/2 */
    725     dl = mp_rshift(d, d, dl, 1);
    726 
    727     /* d = d - b - w */
    728     dl = mp_sub(d, d, b, dl, bl);
    729     dl = mp_sub(d, d, rop, dl, size);
    730 
    731     /* d = d/4 */
    732     dl = mp_rshift(d, d, dl, 2);
    733 
    734 
    735 	/* STORE RESULT IN ROP */
    736     /* first clear memory used as temporary variable w and 8c */
    737     memset(rop, '\0', sizeof(BNS) * (len1 + len2));
    738 
    739     i = x * 4;
    740     xsize = (len1 + len2) - i;
    741     size = mp_add(rop + i, rop + i, e, xsize, el) + i;
    742     i = x * 3;
    743     xsize = size - i;
    744     size = mp_add(rop + i, rop + i, d, xsize, dl) + i;
    745     i = x * 2;
    746     xsize = size - i;
    747     size = mp_add(rop + i, rop + i, c, xsize, cl) + i;
    748     i = x;
    749     xsize = size - i;
    750     size = mp_add(rop + i, rop + i, b, xsize, bl) + i;
    751     size = mp_add(rop, rop, a, size, al);
    752 
    753     mp_free(e);
    754     mp_free(d);
    755     mp_free(c);
    756     mp_free(b);
    757     mp_free(a);
    758 
    759     if (size > 1 && rop[size - 1] == 0)
    760 	--size;
    761 
    762     return (size);
    763 }
    764 
    765 long
    766 mp_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
    767 {
    768     if (len1 < len2)
    769 	MP_SWAP(op1, op2, len1, len2);
    770 
    771     if (len1 < KARATSUBA || len2 < KARATSUBA)
    772 	return (mp_base_mul(rop, op1, op2, len1, len2));
    773     else if (len1 < TOOM && len2 < TOOM && len2 > ((len1 + 1) >> 1))
    774 	return (mp_karatsuba_mul(rop, op1, op2, len1, len2));
    775     else if (len1 >= TOOM && len2 >= TOOM && (len2 + 2) / 3 == (len1 + 2) / 3)
    776 	return (mp_toom_mul(rop, op1, op2, len1, len2));
    777     else {
    778 	long xsize, psize, isize;
    779 	BNS *ptr;
    780 
    781 	/* adjust index pointer and estimated size of result */
    782 	isize = 0;
    783 	xsize = len1 + len2;
    784 	mp_mul(rop, op1, op2, len2, len2);
    785 	/* adjust pointers */
    786 	len1 -= len2;
    787 	op1 += len2;
    788 
    789 	/* allocate buffer for intermediate multiplications */
    790 	if (len1 > len2)
    791 	    ptr = mp_calloc(1, sizeof(BNS) * (len2 + len2));
    792 	else
    793 	    ptr = mp_calloc(1, sizeof(BNS) * (len1 + len2));
    794 
    795 	/* loop multiplying len2 size operands at a time */
    796 	while (len1 >= len2) {
    797 	    isize += len2;
    798 	    psize = mp_mul(ptr, op1, op2, len2, len2);
    799 	    mp_add(rop + isize, rop + isize, ptr, xsize - isize, psize);
    800 	    len1 -= len2;
    801 	    op1 += len2;
    802 
    803 	    /* multiplication routines require zeroed memory */
    804 	    memset(ptr, '\0', sizeof(BNS) * (MIN(len1, len2) + len2));
    805 	}
    806 
    807 	/* len1 was not a multiple of len2 */
    808 	if (len1) {
    809 	    isize += len2;
    810 	    psize = mp_mul(ptr, op2, op1, len2, len1);
    811 	    mp_add(rop + isize, rop + isize, ptr, xsize, psize);
    812 	}
    813 
    814 	/* adjust result size */
    815 	if (rop[xsize - 1] == 0)
    816 	    --xsize;
    817 
    818 	mp_free(ptr);
    819 
    820 	return (xsize);
    821     }
    822 }
    823