mp.c revision 31de2854
1/*
2 * Copyright (c) 2002 by The XFree86 Project, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 *
22 * Except as contained in this notice, the name of the XFree86 Project shall
23 * not be used in advertising or otherwise to promote the sale, use or other
24 * dealings in this Software without prior written authorization from the
25 * XFree86 Project.
26 *
27 * Author: Paulo César Pereira de Andrade
28 */
29
30/* $XFree86: xc/programs/xedit/lisp/mp/mp.c,v 1.2 2002/11/08 08:01:00 paulo Exp $ */
31
32#include "mp.h"
33
34/*
35 * TODO:
36 *	o Optimize squaring
37 *	o Write better division code and move from mpi.c to here
38 *	o Make multiplication code don't required memory to be zeroed
39 *		+ The first step is easy, just multiply the low word,
40 *		then the high word, that may overlap with the result
41 *		of the first multiply (in case of carry), and then
42 *		just make sure carry is properly propagated in the
43 *		subsequent multiplications.
44 *		+ Some code needs also to be rewritten because some
45 *		intermediate addition code in mp_mul, mp_karatsuba_mul,
46 *		and mp_toom_mul is assuming the memory is zeroed.
47 */
48
49/*
50 * Prototypes
51 */
52	/* out of memory handler */
53static void mp_outmem(void);
54
55	/* memory allocation fallback functions */
56static void *_mp_malloc(size_t);
57static void *_mp_calloc(size_t, size_t);
58static void *_mp_realloc(void*, size_t);
59static void _mp_free(void*);
60
61/*
62 * Initialization
63 */
64static mp_malloc_fun __mp_malloc = _mp_malloc;
65static mp_calloc_fun __mp_calloc = _mp_calloc;
66static mp_realloc_fun __mp_realloc = _mp_realloc;
67static mp_free_fun __mp_free = _mp_free;
68
69/*
70 * Implementation
71 */
72static void
73mp_outmem(void)
74{
75    fprintf(stderr, "out of memory in MP library.\n");
76    exit(1);
77}
78
79static void *
80_mp_malloc(size_t size)
81{
82    return (malloc(size));
83}
84
85void *
86mp_malloc(size_t size)
87{
88    void *pointer = (*__mp_malloc)(size);
89
90    if (pointer == NULL)
91	mp_outmem();
92
93    return (pointer);
94}
95
96mp_malloc_fun
97mp_set_malloc(mp_malloc_fun fun)
98{
99    mp_malloc_fun old = __mp_malloc;
100
101    __mp_malloc = fun;
102
103    return (old);
104}
105
106static void *
107_mp_calloc(size_t nmemb, size_t size)
108{
109    return (calloc(nmemb, size));
110}
111
112void *
113mp_calloc(size_t nmemb, size_t size)
114{
115    void *pointer = (*__mp_calloc)(nmemb, size);
116
117    if (pointer == NULL)
118	mp_outmem();
119
120    return (pointer);
121}
122
123mp_calloc_fun
124mp_set_calloc(mp_calloc_fun fun)
125{
126    mp_calloc_fun old = __mp_calloc;
127
128    __mp_calloc = fun;
129
130    return (old);
131}
132
133static void *
134_mp_realloc(void *old, size_t size)
135{
136    return (realloc(old, size));
137}
138
139void *
140mp_realloc(void *old, size_t size)
141{
142    void *pointer = (*__mp_realloc)(old, size);
143
144    if (pointer == NULL)
145	mp_outmem();
146
147    return (pointer);
148}
149
150mp_realloc_fun
151mp_set_realloc(mp_realloc_fun fun)
152{
153    mp_realloc_fun old = __mp_realloc;
154
155    __mp_realloc = fun;
156
157    return (old);
158}
159
160static void
161_mp_free(void *pointer)
162{
163    free(pointer);
164}
165
166void
167mp_free(void *pointer)
168{
169    (*__mp_free)(pointer);
170}
171
172mp_free_fun
173mp_set_free(mp_free_fun fun)
174{
175    mp_free_fun old = __mp_free;
176
177    __mp_free = fun;
178
179    return (old);
180}
181
182long
183mp_add(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
184{
185    BNI value;			/* intermediate result */
186    BNS carry;			/* carry flag */
187    long size;			/* result size */
188
189    if (len1 < len2)
190	MP_SWAP(op1, op2, len1, len2);
191
192    /* unroll start of loop */
193    value = (BNI)op1[0] + op2[0];
194    rop[0] = value;
195    carry = value >> BNSBITS;
196
197    /* add op1 and op2 */
198    for (size = 1; size < len2; size++) {
199	value = (BNI)op1[size] + op2[size] + carry;
200	rop[size] = value;
201	carry = value >> BNSBITS;
202    }
203    if (rop != op1) {
204	for (; size < len1; size++) {
205	    value = (BNI)op1[size] + carry;
206	    rop[size] = value;
207	    carry = value >> BNSBITS;
208	}
209    }
210    else {
211	/* if rop == op1, than just adjust carry */
212	for (; carry && size < len1; size++) {
213	    value = (BNI)op1[size] + carry;
214	    rop[size] = value;
215	    carry = value >> BNSBITS;
216	}
217	size = len1;
218    }
219    if (carry)
220	rop[size++] = carry;
221
222    return (size);
223}
224
225long
226mp_sub(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
227{
228    long svalue;		/* intermediate result */
229    BNS carry;			/* carry flag */
230    long size;			/* result size */
231
232    /* special case */
233    if (op1 == op2) {
234	rop[0] = 0;
235
236	return (1);
237    }
238
239    /* unroll start of loop */
240    svalue = (long)op1[0] - op2[0];
241    rop[0] = svalue;
242    carry = svalue < 0;
243
244    /* subtracts op2 from op1 */
245    for (size = 1; size < len2; size++) {
246	svalue = (long)(op1[size]) - op2[size] - carry;
247	rop[size] = svalue;
248	carry = svalue < 0;
249    }
250    if (rop != op1) {
251	for (; size < len1; size++) {
252	    svalue = op1[size] - carry;
253	    rop[size] = svalue;
254	    carry = svalue < 0;
255	}
256    }
257    else {
258	/* if rop == op1, than just adjust carry */
259	for (; carry && size < len1; size++) {
260	    svalue = (long)op1[size] - carry;
261	    rop[size] = svalue;
262	    carry = svalue < 0;
263	}
264	size = len1;
265    }
266
267    /* calculate result size */
268    while (size > 1 && rop[size - 1] == 0)
269	--size;
270
271    return (size);
272}
273
274long
275mp_lshift(BNS *rop, BNS *op, BNI len, long shift)
276{
277    long i, size;
278    BNI words, bits;		/* how many word and bit shifts */
279
280    words = shift / BNSBITS;
281    bits = shift % BNSBITS;
282    size = len + words;
283
284    if (bits) {
285	BNS hi, lo;
286	BNI carry;
287	int adj;
288
289	for (i = 1, carry = CARRY >> 1; carry; i++, carry >>= 1)
290	    if (op[len - 1] & carry)
291		break;
292	adj = (bits + (BNSBITS - i)) / BNSBITS;
293	size += adj;
294
295	lo = hi = op[0];
296	rop[words] = lo << bits;
297	for (i = 1; i < len; i++) {
298	    hi = op[i];
299	    rop[words + i] = hi << bits | (lo >> (BNSBITS - bits));
300	    lo = hi;
301	}
302	if (adj)
303	    rop[size - 1] = hi >> (BNSBITS - bits);
304    }
305    else
306	memmove(rop + size - len, op, sizeof(BNS) * len);
307
308    if (words)
309	memset(rop, '\0', sizeof(BNS) * words);
310
311    return (size);
312}
313
314long
315mp_rshift(BNS *rop, BNS *op, BNI len, long shift)
316{
317    int adj = 0;
318    long i, size;
319    BNI words, bits;		/* how many word and bit shifts */
320
321    words = shift / BNSBITS;
322    bits = shift % BNSBITS;
323    size = len - words;
324
325    if (bits) {
326	BNS hi, lo;
327	BNI carry;
328
329	for (i = 0, carry = CARRY >> 1; carry; i++, carry >>= 1)
330	    if (op[len - 1] & carry)
331		break;
332	adj = (bits + i) / BNSBITS;
333	if (size - adj == 0) {
334	    rop[0] = 0;
335
336	    return (1);
337	}
338
339	hi = lo = op[words + size - 1];
340	rop[size - 1] = hi >> bits;
341	for (i = size - 2; i >= 0; i--) {
342	    lo = op[words + i];
343	    rop[i] = (lo >> bits) | (hi << (BNSBITS - bits));
344	    hi = lo;
345	}
346	if (adj)
347	    rop[0] |= lo << (BNSBITS - bits);
348    }
349    else
350	memmove(rop, op + len - size, size * sizeof(BNS));
351
352    return (size - adj);
353}
354
355	/* rop must be a pointer to len1 + len2 elements
356	 * rop cannot be either op1 or op2
357	 * rop must be all zeros */
358long
359mp_base_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
360{
361    long i, j;			/* counters */
362    BNI value;			/* intermediate result */
363    BNS carry;			/* carry value */
364    long size = len1 + len2;
365
366    /* simple optimization: first pass does not need to deference rop[i+j] */
367    if (op1[0]) {
368	value = (BNI)(op1[0]) * op2[0];
369	rop[0] = value;
370	carry = (BNS)(value >> BNSBITS);
371	for (j = 1; j < len2; j++) {
372	    value = (BNI)(op1[0]) * op2[j] + carry;
373	    rop[j] = value;
374	    carry = (BNS)(value >> BNSBITS);
375	}
376	rop[j] = carry;
377    }
378
379    /* do the multiplication */
380    for (i = 1; i < len1; i++) {
381	if (op1[i]) {
382	    /* unrool loop initialization */
383	    value = (BNI)(op1[i]) * op2[0] + rop[i];
384	    rop[i] = value;
385	    carry = (BNS)(value >> BNSBITS);
386	    /* multiply */
387	    for (j = 1; j < len2; j++) {
388		value = (BNI)(op1[i]) * op2[j] + rop[i + j] + carry;
389		rop[i + j] = value;
390		carry = (BNS)(value >> BNSBITS);
391	    }
392	    rop[i + j] = carry;
393	}
394    }
395
396    if (size > 1 && rop[size - 1] == 0)
397	--size;
398
399    return (size);
400}
401
402	/* Karatsuba method
403	 * t + ((a0 + a1) (b0 + b1) - t - u) x + ux²
404	 * where t = a0b0 and u = a1b1
405	 *
406	 * Karatsuba method reduces the number of multiplications. Example:
407	 *	Square a 40 length number
408	 *	instead of a plain 40*40 = 1600 multiplies/adds, it does:
409	 *	20*20+20*20+20*20 = 1200
410	 *	but since it is recursive, every 20*20=400 is reduced to
411	 *	10*10+10*10+10*10=300
412	 *	and so on.
413	 * The multiplication by x and x² is a just a shift, as it is a
414	 * power of two, and is implemented below by just writting at the
415	 * correct offset */
416long
417mp_karatsuba_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
418{
419    BNI x;				/* shift count */
420    BNI la0, la1, lb0, lb1;		/* length of a0, a1, b0, and b1 */
421    BNS *t;				/* temporary memory for t product */
422    BNS *u;				/* temporary memory for u product */
423    BNS *r;				/* pointer to rop */
424    long xlen, tlen, ulen;
425
426    /* calculate value of x, that is 2^(BNSBITS*x) */
427    if (len1 >= len2)
428	x = (len1 + 1) >> 1;
429    else
430	x = (len2 + 1) >> 1;
431
432    /* calculate length of operands */
433    la0 = x;
434    la1 = len1 - x;
435    lb0 = x;
436    lb1 = len2 - x;
437
438    /* allocate buffer for t and (a0 + a1) */
439    tlen = la0 + lb0;
440    t = mp_malloc(sizeof(BNS) * tlen);
441
442    /* allocate buffer for u and (b0 + b1) */
443    if (la1 + lb1 < lb0 + lb1 + 1)
444	ulen = lb0 + lb1 + 1;
445    else
446	ulen = la1 + lb1;
447    u = mp_malloc(sizeof(BNS) * ulen);
448
449    /* calculate a0 + a1, store result in t */
450    tlen = mp_add(t, op1, op1 + x, la0, la1);
451
452    /* calculate b0 + b1, store result in u */
453    ulen = mp_add(u, op2, op2 + x, lb0, lb1);
454
455    /* store (a0 + a1) * (b0 + b1) in rop */
456
457    r = rop + x;		/* multiplied by 2^(BNSBITS*x) */
458    xlen = mp_mul(r, t, u, tlen, ulen);
459
460    /* must zero t and u memory, this is required for mp_mul */
461
462    /* calculate t = a0 * b0 */
463    tlen = la0 + lb0;
464    memset(t, '\0', sizeof(BNS) * tlen);
465    tlen = mp_mul(t, op1, op2, la0, lb0);
466
467    /* calculate u = a1 * b1 */
468    ulen = la1 + lb1;
469    memset(u, '\0', sizeof(BNS) * ulen);
470    ulen = mp_mul(u, op1 + x, op2 + x, la1, lb1);
471
472    /* subtract t from partial result */
473    xlen = mp_sub(r, r, t, xlen, tlen);
474
475    /* subtract u form partial result */
476    xlen = mp_sub(r, r, u, xlen, ulen);
477
478    /* add ux^2 to partial result */
479
480    r = rop + (x << 1);		/* multiplied by x^2 = 2^(BNSBITS*x*2) */
481    xlen = len1 + len2;
482    xlen = mp_add(r, r, u, xlen, ulen);
483
484    /* now add t to final result */
485    xlen = mp_add(rop, rop, t, xlen, tlen);
486
487    mp_free(t);
488    mp_free(u);
489
490    if (xlen > 1 && rop[xlen - 1] == 0)
491	--xlen;
492
493    return (xlen);
494}
495
496	/* Toom method	(partially based on GMP documentation)
497	 * Evaluation at k = [ 0 1/2 1 2 oo ]
498	 * U(x) = (U2k + U1)k + U0
499	 * V(x) = (V2k + V1)k + V0
500	 * W(x) = U(x)V(x)
501	 *
502	 * Sample:
503	 *	123 * 456
504	 *
505	 *	EVALUATION:
506	 * U(0) = (1*0+2)*0+3	=> 3
507	 * U(1) = 1+(2+3*2)*2	=> 17
508	 * U(2) = 1+2+3		=> 6
509	 * U(3) = (1*2+2)*2+3	=> 11
510	 * U(4)	= 1+(2+3*0)*0	=> 1
511	 *
512	 * V(0) = (4*0+5)*0+6	=> 6
513	 * V(1) = 4+(5+6*2)*2	=> 38
514	 * V(2) = 4+5+6		=> 15
515	 * V(3) = (4*2+5)*2+6	=> 32
516	 * V(4)	= 4+(5+6*0)*0	=> 4
517	 *
518	 *	U = [ 3   17  6  11 1 ]
519	 *	V = [ 6   38 15  32 4 ]
520	 *	W = [ 18 646 90 352 4 ]
521	 *
522	 * After that, we have:
523	 *	a = 18					(w0 already known)
524	 *	b = 16w0 + 8w1 + 4w2 + 2w3 +   w4
525	 *	c =   w0 +  w1 +  w2 +  w3 +   w4
526	 *	d =   w0 + 2w1 + 4w2 + 8w3 + 16w4
527	 *	e = 4					(w4 already known)
528	 *
529	 *	INTERPOLATION:
530	 *	b = b -16a - e		(354)
531	 *	c = c - a - e		(68)
532	 *	d = d - a - 16e		(270)
533	 *
534	 *	w = (b + d) - 8c = (10w1+8w2+10w3) - (8w1+8w2+8w3) = 2w1+2w3
535	 *	w = 2c - w		(56)
536	 *	b = b/2 = 4w1+w+w3
537	 *	b = b-c = 4w1+w+w3 - w1+w2+w3 = 3w1+w2
538	 *	c = w/2			(w2 = 28)
539	 *	b = b-c = 3w1+c - c = 3w1
540	 *	b = b/3			(w1 = 27)
541	 *	d = d/2
542	 *	d = d-b-w = b+w+4w3 - b-w = 4w3
543	 *	d = d/4			(w3 = 13)
544	 *
545	 *	RESULT:
546	 *	w4*10^4 + w3*10³ + w2*10² + w1*10 + w0
547	 *	40000   + 13000   + 2800   + 270    + 18
548	 *	10 is the base where the calculation was done
549	 *
550	 *	This sample uses small numbers, so it does not show the
551	 * advantage of the method. But for example (in base 10), when squaring
552	 *	123456789012345678901234567890
553	 *	The normal method would do 30*30=900 multiplications
554	 *	Karatsuba method would do 15*15*3=675 multiplications
555	 *	Toom method would do 10*10*5=500 multiplications
556	 * Toom method has a larger overhead if compared with Karatsuba method,
557	 * due to evaluation and interpolation, so it should be used for larger
558	 * numbers, so that the computation time of evaluation/interpolation
559	 * would be smaller than the time spent using other methods.
560	 *
561	 *	Note that Karatsuba method can be seen as a special case of
562	 * Toom method, i.e:
563	 *	U1U0 * V1V0
564	 *	with k = [ 0 1 oo ]
565	 *	U = [ U0 U1+U0 U1 ]
566	 *	V = [ V0 V1+V0 V1 ]
567	 *	W = [ U0*V0 (U1+U0)*(V1+V0) (U1+V1) ]
568	 *
569	 *	w0 = U0*V0
570	 *	w = (U1+U0)*(V1+V0)
571	 *	w2 = (U1*V1)
572	 *
573	 *	w1 = w - w0 - w2
574	 * w2x² + w1x + w0
575	 *
576	 *	See Knuth's Seminumerical Algorithms for a sample implemention
577	 * using 4 stacks and k = [ 0 1 2 3 ... ], based on the size of the
578	 * input.
579	 */
580long
581mp_toom_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
582{
583    long size, xsize, i;
584    BNI value;				/* used in division */
585    BNS carry;
586    BNI x;				/* shift count */
587    BNI l1, l2;
588    BNI al, bl, cl, dl, el, Ul[3], Vl[3];
589    BNS *a, *b, *c, *d, *e, *U[3], *V[3];
590
591    /* x is the base i.e. 2^(BNSBITS*x) */
592    x = (len1 + len2 + 4) / 6;
593    l1 = len1 - (x << 1);	/* length of remaining piece of op1 */
594    l2 = len2 - (x << 1);	/* length of remaining piece of op2 */
595
596    /* allocate memory for storing U and V */
597    U[0] = mp_malloc(sizeof(BNS) * (x + 2));
598    V[0] = mp_malloc(sizeof(BNS) * (x + 2));
599    U[1] = mp_malloc(sizeof(BNS) * (x + 1));
600    V[1] = mp_malloc(sizeof(BNS) * (x + 1));
601    U[2] = mp_malloc(sizeof(BNS) * (x + 2));
602    V[2] = mp_malloc(sizeof(BNS) * (x + 2));
603
604	/* EVALUATE U AND V */
605
606    /* Numbers are in the format U2x²+U1x+U0 and V2x²+V1x+V0 */
607
608	/* U[0] = U2+U1*2+U0*4 */
609
610    /* store U1*2 in U[1], this value is used twice */
611    Ul[1] = mp_lshift(U[1], op1 + x, x, 1);
612
613    /* store U0*4 in U[0] */
614    Ul[0] = mp_lshift(U[0], op1, x, 2);
615    /* add U1*2 to U[0] */
616    Ul[0] = mp_add(U[0], U[0], U[1], Ul[0], Ul[1]);
617    /* add U2 to U[0] */
618    Ul[0] = mp_add(U[0], U[0], op1 + x + x, Ul[0], l1);
619
620	/* U[2] = U2*4+U1*2+U0 */
621
622    /* store U2*4 in U[2] */
623    Ul[2] = mp_lshift(U[2], op1 + x + x, l1, 2);
624    /* add U1*2 to U[2] */
625    Ul[2] = mp_add(U[2], U[2], U[1], Ul[2], Ul[1]);
626    /* add U0 to U[2] */
627    Ul[2] = mp_add(U[2], U[2], op1, Ul[2], x);
628
629	/* U[1] = U2+U1+U0 */
630
631    Ul[1] = mp_add(U[1], op1, op1 + x, x, x);
632    Ul[1] = mp_add(U[1], U[1], op1 + x + x, Ul[1], l1);
633
634
635	/* Evaluate V[x], same code as U[x] */
636    Vl[1] = mp_lshift(V[1], op2 + x, x, 1);
637    Vl[0] = mp_lshift(V[0], op2, x, 2);
638    Vl[0] = mp_add(V[0], V[0], V[1], Vl[0], Vl[1]);
639    Vl[0] = mp_add(V[0], V[0], op2 + x + x, Vl[0], l2);
640    Vl[2] = mp_lshift(V[2], op2 + x + x, l2, 2);
641    Vl[2] = mp_add(V[2], V[2], V[1], Vl[2], Vl[1]);
642    Vl[2] = mp_add(V[2], V[2], op2, Vl[2], x);
643    Vl[1] = mp_add(V[1], op2, op2 + x, x, x);
644    Vl[1] = mp_add(V[1], V[1], op2 + x + x, Vl[1], l2);
645
646
647	/* MULTIPLY U[] AND V[] */
648
649	/* calculate (U2+U1*2+U0*4) * (V2+V1*2+V0*4) */
650    b = mp_calloc(1, sizeof(BNS) * (Ul[0] * Vl[0]));
651    bl = mp_mul(b, U[0], V[0], Ul[0], Vl[0]);
652    mp_free(U[0]);
653    mp_free(V[0]);
654
655	/* calculate (U2+U1+U0) * (V2+V1+V0) */
656    c = mp_calloc(1, sizeof(BNS) * (Ul[1] * Vl[1]));
657    cl = mp_mul(c, U[1], V[1], Ul[1], Vl[1]);
658    mp_free(U[1]);
659    mp_free(V[1]);
660
661	/* calculate (U2*4+U1*2+U0) * (V2*4+V1*2+V0) */
662    d = mp_calloc(1, sizeof(BNS) * (Ul[2] * Vl[2]));
663    dl = mp_mul(d, U[2], V[2], Ul[2], Vl[2]);
664    mp_free(U[2]);
665    mp_free(V[2]);
666
667	/* calculate U0 * V0 */
668    a = mp_calloc(1, sizeof(BNS) * (x + x));
669    al = mp_mul(a, op1, op2, x, x);
670
671	/* calculate U2 * V2 */
672    e = mp_calloc(1, sizeof(BNS) * (l1 + l2));
673    el = mp_mul(e, op1 + x + x, op2 + x + x, l1, l2);
674
675
676	/* INTERPOLATE COEFFICIENTS */
677
678    /* b = b - 16a - e */
679    size = mp_lshift(rop, a, al, 4);
680    bl = mp_sub(b, b, rop, bl, size);
681    bl = mp_sub(b, b, e, bl, el);
682
683    /* c = c - a - e*/
684    cl = mp_sub(c, c, a, cl, al);
685    cl = mp_sub(c, c, e, cl, el);
686
687    /* d = d - a - 16e */
688    dl = mp_sub(d, d, a, dl, al);
689    size = mp_lshift(rop, e, el, 4);
690    dl = mp_sub(d, d, rop, dl, size);
691
692    /* w = (b + d) - 8c */
693    size = mp_add(rop, b, d, bl, dl);
694    xsize = mp_lshift(rop + size, c, cl, 3);	/* rop has enough storage */
695    size = mp_sub(rop, rop, rop + size, size, xsize);
696
697    /* w = 2c - w*/
698    xsize = mp_lshift(rop + size, c, cl, 1);
699    size = mp_sub(rop, rop + size, rop, xsize, size);
700
701    /* b = b/2 */
702    bl = mp_rshift(b, b, bl, 1);
703
704    /* b = b - c */
705    bl = mp_sub(b, b, c, bl, cl);
706
707    /* c = w / 2 */
708    cl = mp_rshift(c, rop, size, 1);
709
710    /* b = b - c */
711    bl = mp_sub(b, b, c, bl, cl);
712
713    /* b = b/3 */
714	/* maybe the most expensive calculation */
715    i = bl - 1;
716    value = b[i];
717    b[i] = value / 3;
718    for (--i; i >= 0; i--) {
719	carry = value % 3;
720	value = ((BNI)carry << BNSBITS) + b[i];
721	b[i] = (BNS)(value / 3);
722    }
723
724    /* d = d/2 */
725    dl = mp_rshift(d, d, dl, 1);
726
727    /* d = d - b - w */
728    dl = mp_sub(d, d, b, dl, bl);
729    dl = mp_sub(d, d, rop, dl, size);
730
731    /* d = d/4 */
732    dl = mp_rshift(d, d, dl, 2);
733
734
735	/* STORE RESULT IN ROP */
736    /* first clear memory used as temporary variable w and 8c */
737    memset(rop, '\0', sizeof(BNS) * (len1 + len2));
738
739    i = x * 4;
740    xsize = (len1 + len2) - i;
741    size = mp_add(rop + i, rop + i, e, xsize, el) + i;
742    i = x * 3;
743    xsize = size - i;
744    size = mp_add(rop + i, rop + i, d, xsize, dl) + i;
745    i = x * 2;
746    xsize = size - i;
747    size = mp_add(rop + i, rop + i, c, xsize, cl) + i;
748    i = x;
749    xsize = size - i;
750    size = mp_add(rop + i, rop + i, b, xsize, bl) + i;
751    size = mp_add(rop, rop, a, size, al);
752
753    mp_free(e);
754    mp_free(d);
755    mp_free(c);
756    mp_free(b);
757    mp_free(a);
758
759    if (size > 1 && rop[size - 1] == 0)
760	--size;
761
762    return (size);
763}
764
765long
766mp_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2)
767{
768    if (len1 < len2)
769	MP_SWAP(op1, op2, len1, len2);
770
771    if (len1 < KARATSUBA || len2 < KARATSUBA)
772	return (mp_base_mul(rop, op1, op2, len1, len2));
773    else if (len1 < TOOM && len2 < TOOM && len2 > ((len1 + 1) >> 1))
774	return (mp_karatsuba_mul(rop, op1, op2, len1, len2));
775    else if (len1 >= TOOM && len2 >= TOOM && (len2 + 2) / 3 == (len1 + 2) / 3)
776	return (mp_toom_mul(rop, op1, op2, len1, len2));
777    else {
778	long xsize, psize, isize;
779	BNS *ptr;
780
781	/* adjust index pointer and estimated size of result */
782	isize = 0;
783	xsize = len1 + len2;
784	mp_mul(rop, op1, op2, len2, len2);
785	/* adjust pointers */
786	len1 -= len2;
787	op1 += len2;
788
789	/* allocate buffer for intermediate multiplications */
790	if (len1 > len2)
791	    ptr = mp_calloc(1, sizeof(BNS) * (len2 + len2));
792	else
793	    ptr = mp_calloc(1, sizeof(BNS) * (len1 + len2));
794
795	/* loop multiplying len2 size operands at a time */
796	while (len1 >= len2) {
797	    isize += len2;
798	    psize = mp_mul(ptr, op1, op2, len2, len2);
799	    mp_add(rop + isize, rop + isize, ptr, xsize - isize, psize);
800	    len1 -= len2;
801	    op1 += len2;
802
803	    /* multiplication routines require zeroed memory */
804	    memset(ptr, '\0', sizeof(BNS) * (MIN(len1, len2) + len2));
805	}
806
807	/* len1 was not a multiple of len2 */
808	if (len1) {
809	    isize += len2;
810	    psize = mp_mul(ptr, op2, op1, len2, len1);
811	    mp_add(rop + isize, rop + isize, ptr, xsize, psize);
812	}
813
814	/* adjust result size */
815	if (rop[xsize - 1] == 0)
816	    --xsize;
817
818	mp_free(ptr);
819
820	return (xsize);
821    }
822}
823