15dfecf96Smrg/* 25dfecf96Smrg * Copyright (c) 2002 by The XFree86 Project, Inc. 35dfecf96Smrg * 45dfecf96Smrg * Permission is hereby granted, free of charge, to any person obtaining a 55dfecf96Smrg * copy of this software and associated documentation files (the "Software"), 65dfecf96Smrg * to deal in the Software without restriction, including without limitation 75dfecf96Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 85dfecf96Smrg * and/or sell copies of the Software, and to permit persons to whom the 95dfecf96Smrg * Software is furnished to do so, subject to the following conditions: 105dfecf96Smrg * 115dfecf96Smrg * The above copyright notice and this permission notice shall be included in 125dfecf96Smrg * all copies or substantial portions of the Software. 135dfecf96Smrg * 145dfecf96Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 155dfecf96Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 165dfecf96Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 175dfecf96Smrg * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 185dfecf96Smrg * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 195dfecf96Smrg * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 205dfecf96Smrg * SOFTWARE. 215dfecf96Smrg * 225dfecf96Smrg * Except as contained in this notice, the name of the XFree86 Project shall 235dfecf96Smrg * not be used in advertising or otherwise to promote the sale, use or other 245dfecf96Smrg * dealings in this Software without prior written authorization from the 255dfecf96Smrg * XFree86 Project. 265dfecf96Smrg * 275dfecf96Smrg * Author: Paulo César Pereira de Andrade 285dfecf96Smrg */ 295dfecf96Smrg 305dfecf96Smrg/* $XFree86: xc/programs/xedit/lisp/mp/mp.c,v 1.2 2002/11/08 08:01:00 paulo Exp $ */ 315dfecf96Smrg 325dfecf96Smrg#include "mp.h" 335dfecf96Smrg 345dfecf96Smrg/* 355dfecf96Smrg * TODO: 365dfecf96Smrg * o Optimize squaring 375dfecf96Smrg * o Write better division code and move from mpi.c to here 385dfecf96Smrg * o Make multiplication code don't required memory to be zeroed 395dfecf96Smrg * + The first step is easy, just multiply the low word, 405dfecf96Smrg * then the high word, that may overlap with the result 415dfecf96Smrg * of the first multiply (in case of carry), and then 425dfecf96Smrg * just make sure carry is properly propagated in the 435dfecf96Smrg * subsequent multiplications. 445dfecf96Smrg * + Some code needs also to be rewritten because some 455dfecf96Smrg * intermediate addition code in mp_mul, mp_karatsuba_mul, 465dfecf96Smrg * and mp_toom_mul is assuming the memory is zeroed. 475dfecf96Smrg */ 485dfecf96Smrg 495dfecf96Smrg/* 505dfecf96Smrg * Prototypes 515dfecf96Smrg */ 525dfecf96Smrg /* out of memory handler */ 535dfecf96Smrgstatic void mp_outmem(void); 545dfecf96Smrg 555dfecf96Smrg /* memory allocation fallback functions */ 565dfecf96Smrgstatic void *_mp_malloc(size_t); 575dfecf96Smrgstatic void *_mp_calloc(size_t, size_t); 585dfecf96Smrgstatic void *_mp_realloc(void*, size_t); 595dfecf96Smrgstatic void _mp_free(void*); 605dfecf96Smrg 615dfecf96Smrg/* 625dfecf96Smrg * Initialization 635dfecf96Smrg */ 645dfecf96Smrgstatic mp_malloc_fun __mp_malloc = _mp_malloc; 655dfecf96Smrgstatic mp_calloc_fun __mp_calloc = _mp_calloc; 665dfecf96Smrgstatic mp_realloc_fun __mp_realloc = _mp_realloc; 675dfecf96Smrgstatic mp_free_fun __mp_free = _mp_free; 685dfecf96Smrg 695dfecf96Smrg/* 705dfecf96Smrg * Implementation 715dfecf96Smrg */ 725dfecf96Smrgstatic void 735dfecf96Smrgmp_outmem(void) 745dfecf96Smrg{ 755dfecf96Smrg fprintf(stderr, "out of memory in MP library.\n"); 765dfecf96Smrg exit(1); 775dfecf96Smrg} 785dfecf96Smrg 795dfecf96Smrgstatic void * 805dfecf96Smrg_mp_malloc(size_t size) 815dfecf96Smrg{ 825dfecf96Smrg return (malloc(size)); 835dfecf96Smrg} 845dfecf96Smrg 855dfecf96Smrgvoid * 865dfecf96Smrgmp_malloc(size_t size) 875dfecf96Smrg{ 885dfecf96Smrg void *pointer = (*__mp_malloc)(size); 895dfecf96Smrg 905dfecf96Smrg if (pointer == NULL) 915dfecf96Smrg mp_outmem(); 925dfecf96Smrg 935dfecf96Smrg return (pointer); 945dfecf96Smrg} 955dfecf96Smrg 965dfecf96Smrgmp_malloc_fun 975dfecf96Smrgmp_set_malloc(mp_malloc_fun fun) 985dfecf96Smrg{ 995dfecf96Smrg mp_malloc_fun old = __mp_malloc; 1005dfecf96Smrg 1015dfecf96Smrg __mp_malloc = fun; 1025dfecf96Smrg 1035dfecf96Smrg return (old); 1045dfecf96Smrg} 1055dfecf96Smrg 1065dfecf96Smrgstatic void * 1075dfecf96Smrg_mp_calloc(size_t nmemb, size_t size) 1085dfecf96Smrg{ 1095dfecf96Smrg return (calloc(nmemb, size)); 1105dfecf96Smrg} 1115dfecf96Smrg 1125dfecf96Smrgvoid * 1135dfecf96Smrgmp_calloc(size_t nmemb, size_t size) 1145dfecf96Smrg{ 1155dfecf96Smrg void *pointer = (*__mp_calloc)(nmemb, size); 1165dfecf96Smrg 1175dfecf96Smrg if (pointer == NULL) 1185dfecf96Smrg mp_outmem(); 1195dfecf96Smrg 1205dfecf96Smrg return (pointer); 1215dfecf96Smrg} 1225dfecf96Smrg 1235dfecf96Smrgmp_calloc_fun 1245dfecf96Smrgmp_set_calloc(mp_calloc_fun fun) 1255dfecf96Smrg{ 1265dfecf96Smrg mp_calloc_fun old = __mp_calloc; 1275dfecf96Smrg 1285dfecf96Smrg __mp_calloc = fun; 1295dfecf96Smrg 1305dfecf96Smrg return (old); 1315dfecf96Smrg} 1325dfecf96Smrg 1335dfecf96Smrgstatic void * 1345dfecf96Smrg_mp_realloc(void *old, size_t size) 1355dfecf96Smrg{ 1365dfecf96Smrg return (realloc(old, size)); 1375dfecf96Smrg} 1385dfecf96Smrg 1395dfecf96Smrgvoid * 1405dfecf96Smrgmp_realloc(void *old, size_t size) 1415dfecf96Smrg{ 1425dfecf96Smrg void *pointer = (*__mp_realloc)(old, size); 1435dfecf96Smrg 1445dfecf96Smrg if (pointer == NULL) 1455dfecf96Smrg mp_outmem(); 1465dfecf96Smrg 1475dfecf96Smrg return (pointer); 1485dfecf96Smrg} 1495dfecf96Smrg 1505dfecf96Smrgmp_realloc_fun 1515dfecf96Smrgmp_set_realloc(mp_realloc_fun fun) 1525dfecf96Smrg{ 1535dfecf96Smrg mp_realloc_fun old = __mp_realloc; 1545dfecf96Smrg 1555dfecf96Smrg __mp_realloc = fun; 1565dfecf96Smrg 1575dfecf96Smrg return (old); 1585dfecf96Smrg} 1595dfecf96Smrg 1605dfecf96Smrgstatic void 1615dfecf96Smrg_mp_free(void *pointer) 1625dfecf96Smrg{ 1635dfecf96Smrg free(pointer); 1645dfecf96Smrg} 1655dfecf96Smrg 1665dfecf96Smrgvoid 1675dfecf96Smrgmp_free(void *pointer) 1685dfecf96Smrg{ 1695dfecf96Smrg (*__mp_free)(pointer); 1705dfecf96Smrg} 1715dfecf96Smrg 1725dfecf96Smrgmp_free_fun 1735dfecf96Smrgmp_set_free(mp_free_fun fun) 1745dfecf96Smrg{ 1755dfecf96Smrg mp_free_fun old = __mp_free; 1765dfecf96Smrg 1775dfecf96Smrg __mp_free = fun; 1785dfecf96Smrg 1795dfecf96Smrg return (old); 1805dfecf96Smrg} 1815dfecf96Smrg 1825dfecf96Smrglong 1835dfecf96Smrgmp_add(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 1845dfecf96Smrg{ 1855dfecf96Smrg BNI value; /* intermediate result */ 1865dfecf96Smrg BNS carry; /* carry flag */ 1875dfecf96Smrg long size; /* result size */ 1885dfecf96Smrg 1895dfecf96Smrg if (len1 < len2) 1905dfecf96Smrg MP_SWAP(op1, op2, len1, len2); 1915dfecf96Smrg 1925dfecf96Smrg /* unroll start of loop */ 19331de2854Smrg value = (BNI)op1[0] + op2[0]; 1945dfecf96Smrg rop[0] = value; 1955dfecf96Smrg carry = value >> BNSBITS; 1965dfecf96Smrg 1975dfecf96Smrg /* add op1 and op2 */ 1985dfecf96Smrg for (size = 1; size < len2; size++) { 19931de2854Smrg value = (BNI)op1[size] + op2[size] + carry; 2005dfecf96Smrg rop[size] = value; 2015dfecf96Smrg carry = value >> BNSBITS; 2025dfecf96Smrg } 2035dfecf96Smrg if (rop != op1) { 2045dfecf96Smrg for (; size < len1; size++) { 20531de2854Smrg value = (BNI)op1[size] + carry; 2065dfecf96Smrg rop[size] = value; 2075dfecf96Smrg carry = value >> BNSBITS; 2085dfecf96Smrg } 2095dfecf96Smrg } 2105dfecf96Smrg else { 2115dfecf96Smrg /* if rop == op1, than just adjust carry */ 2125dfecf96Smrg for (; carry && size < len1; size++) { 21331de2854Smrg value = (BNI)op1[size] + carry; 2145dfecf96Smrg rop[size] = value; 2155dfecf96Smrg carry = value >> BNSBITS; 2165dfecf96Smrg } 2175dfecf96Smrg size = len1; 2185dfecf96Smrg } 2195dfecf96Smrg if (carry) 2205dfecf96Smrg rop[size++] = carry; 2215dfecf96Smrg 2225dfecf96Smrg return (size); 2235dfecf96Smrg} 2245dfecf96Smrg 2255dfecf96Smrglong 2265dfecf96Smrgmp_sub(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 2275dfecf96Smrg{ 2285dfecf96Smrg long svalue; /* intermediate result */ 2295dfecf96Smrg BNS carry; /* carry flag */ 2305dfecf96Smrg long size; /* result size */ 2315dfecf96Smrg 2325dfecf96Smrg /* special case */ 2335dfecf96Smrg if (op1 == op2) { 2345dfecf96Smrg rop[0] = 0; 2355dfecf96Smrg 2365dfecf96Smrg return (1); 2375dfecf96Smrg } 2385dfecf96Smrg 2395dfecf96Smrg /* unroll start of loop */ 24031de2854Smrg svalue = (long)op1[0] - op2[0]; 2415dfecf96Smrg rop[0] = svalue; 2425dfecf96Smrg carry = svalue < 0; 2435dfecf96Smrg 2445dfecf96Smrg /* subtracts op2 from op1 */ 2455dfecf96Smrg for (size = 1; size < len2; size++) { 2465dfecf96Smrg svalue = (long)(op1[size]) - op2[size] - carry; 2475dfecf96Smrg rop[size] = svalue; 2485dfecf96Smrg carry = svalue < 0; 2495dfecf96Smrg } 2505dfecf96Smrg if (rop != op1) { 2515dfecf96Smrg for (; size < len1; size++) { 2525dfecf96Smrg svalue = op1[size] - carry; 2535dfecf96Smrg rop[size] = svalue; 2545dfecf96Smrg carry = svalue < 0; 2555dfecf96Smrg } 2565dfecf96Smrg } 2575dfecf96Smrg else { 2585dfecf96Smrg /* if rop == op1, than just adjust carry */ 2595dfecf96Smrg for (; carry && size < len1; size++) { 26031de2854Smrg svalue = (long)op1[size] - carry; 2615dfecf96Smrg rop[size] = svalue; 2625dfecf96Smrg carry = svalue < 0; 2635dfecf96Smrg } 2645dfecf96Smrg size = len1; 2655dfecf96Smrg } 2665dfecf96Smrg 2675dfecf96Smrg /* calculate result size */ 2685dfecf96Smrg while (size > 1 && rop[size - 1] == 0) 2695dfecf96Smrg --size; 2705dfecf96Smrg 2715dfecf96Smrg return (size); 2725dfecf96Smrg} 2735dfecf96Smrg 2745dfecf96Smrglong 2755dfecf96Smrgmp_lshift(BNS *rop, BNS *op, BNI len, long shift) 2765dfecf96Smrg{ 2775dfecf96Smrg long i, size; 2785dfecf96Smrg BNI words, bits; /* how many word and bit shifts */ 2795dfecf96Smrg 2805dfecf96Smrg words = shift / BNSBITS; 2815dfecf96Smrg bits = shift % BNSBITS; 2825dfecf96Smrg size = len + words; 2835dfecf96Smrg 2845dfecf96Smrg if (bits) { 2855dfecf96Smrg BNS hi, lo; 2865dfecf96Smrg BNI carry; 2875dfecf96Smrg int adj; 2885dfecf96Smrg 2895dfecf96Smrg for (i = 1, carry = CARRY >> 1; carry; i++, carry >>= 1) 2905dfecf96Smrg if (op[len - 1] & carry) 2915dfecf96Smrg break; 2925dfecf96Smrg adj = (bits + (BNSBITS - i)) / BNSBITS; 2935dfecf96Smrg size += adj; 2945dfecf96Smrg 2955dfecf96Smrg lo = hi = op[0]; 2965dfecf96Smrg rop[words] = lo << bits; 2975dfecf96Smrg for (i = 1; i < len; i++) { 2985dfecf96Smrg hi = op[i]; 2995dfecf96Smrg rop[words + i] = hi << bits | (lo >> (BNSBITS - bits)); 3005dfecf96Smrg lo = hi; 3015dfecf96Smrg } 3025dfecf96Smrg if (adj) 3035dfecf96Smrg rop[size - 1] = hi >> (BNSBITS - bits); 3045dfecf96Smrg } 3055dfecf96Smrg else 3065dfecf96Smrg memmove(rop + size - len, op, sizeof(BNS) * len); 3075dfecf96Smrg 3085dfecf96Smrg if (words) 3095dfecf96Smrg memset(rop, '\0', sizeof(BNS) * words); 3105dfecf96Smrg 3115dfecf96Smrg return (size); 3125dfecf96Smrg} 3135dfecf96Smrg 3145dfecf96Smrglong 3155dfecf96Smrgmp_rshift(BNS *rop, BNS *op, BNI len, long shift) 3165dfecf96Smrg{ 3175dfecf96Smrg int adj = 0; 3185dfecf96Smrg long i, size; 3195dfecf96Smrg BNI words, bits; /* how many word and bit shifts */ 3205dfecf96Smrg 3215dfecf96Smrg words = shift / BNSBITS; 3225dfecf96Smrg bits = shift % BNSBITS; 3235dfecf96Smrg size = len - words; 3245dfecf96Smrg 3255dfecf96Smrg if (bits) { 3265dfecf96Smrg BNS hi, lo; 3275dfecf96Smrg BNI carry; 3285dfecf96Smrg 3295dfecf96Smrg for (i = 0, carry = CARRY >> 1; carry; i++, carry >>= 1) 3305dfecf96Smrg if (op[len - 1] & carry) 3315dfecf96Smrg break; 3325dfecf96Smrg adj = (bits + i) / BNSBITS; 3335dfecf96Smrg if (size - adj == 0) { 3345dfecf96Smrg rop[0] = 0; 3355dfecf96Smrg 3365dfecf96Smrg return (1); 3375dfecf96Smrg } 3385dfecf96Smrg 3395dfecf96Smrg hi = lo = op[words + size - 1]; 3405dfecf96Smrg rop[size - 1] = hi >> bits; 3415dfecf96Smrg for (i = size - 2; i >= 0; i--) { 3425dfecf96Smrg lo = op[words + i]; 3435dfecf96Smrg rop[i] = (lo >> bits) | (hi << (BNSBITS - bits)); 3445dfecf96Smrg hi = lo; 3455dfecf96Smrg } 3465dfecf96Smrg if (adj) 3475dfecf96Smrg rop[0] |= lo << (BNSBITS - bits); 3485dfecf96Smrg } 3495dfecf96Smrg else 3505dfecf96Smrg memmove(rop, op + len - size, size * sizeof(BNS)); 3515dfecf96Smrg 3525dfecf96Smrg return (size - adj); 3535dfecf96Smrg} 3545dfecf96Smrg 3555dfecf96Smrg /* rop must be a pointer to len1 + len2 elements 3565dfecf96Smrg * rop cannot be either op1 or op2 3575dfecf96Smrg * rop must be all zeros */ 3585dfecf96Smrglong 3595dfecf96Smrgmp_base_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 3605dfecf96Smrg{ 3615dfecf96Smrg long i, j; /* counters */ 3625dfecf96Smrg BNI value; /* intermediate result */ 3635dfecf96Smrg BNS carry; /* carry value */ 3645dfecf96Smrg long size = len1 + len2; 3655dfecf96Smrg 3665dfecf96Smrg /* simple optimization: first pass does not need to deference rop[i+j] */ 3675dfecf96Smrg if (op1[0]) { 3685dfecf96Smrg value = (BNI)(op1[0]) * op2[0]; 3695dfecf96Smrg rop[0] = value; 3705dfecf96Smrg carry = (BNS)(value >> BNSBITS); 3715dfecf96Smrg for (j = 1; j < len2; j++) { 3725dfecf96Smrg value = (BNI)(op1[0]) * op2[j] + carry; 3735dfecf96Smrg rop[j] = value; 3745dfecf96Smrg carry = (BNS)(value >> BNSBITS); 3755dfecf96Smrg } 3765dfecf96Smrg rop[j] = carry; 3775dfecf96Smrg } 3785dfecf96Smrg 3795dfecf96Smrg /* do the multiplication */ 3805dfecf96Smrg for (i = 1; i < len1; i++) { 3815dfecf96Smrg if (op1[i]) { 3825dfecf96Smrg /* unrool loop initialization */ 3835dfecf96Smrg value = (BNI)(op1[i]) * op2[0] + rop[i]; 3845dfecf96Smrg rop[i] = value; 3855dfecf96Smrg carry = (BNS)(value >> BNSBITS); 3865dfecf96Smrg /* multiply */ 3875dfecf96Smrg for (j = 1; j < len2; j++) { 3885dfecf96Smrg value = (BNI)(op1[i]) * op2[j] + rop[i + j] + carry; 3895dfecf96Smrg rop[i + j] = value; 3905dfecf96Smrg carry = (BNS)(value >> BNSBITS); 3915dfecf96Smrg } 3925dfecf96Smrg rop[i + j] = carry; 3935dfecf96Smrg } 3945dfecf96Smrg } 3955dfecf96Smrg 3965dfecf96Smrg if (size > 1 && rop[size - 1] == 0) 3975dfecf96Smrg --size; 3985dfecf96Smrg 3995dfecf96Smrg return (size); 4005dfecf96Smrg} 4015dfecf96Smrg 4025dfecf96Smrg /* Karatsuba method 4035dfecf96Smrg * t + ((a0 + a1) (b0 + b1) - t - u) x + ux² 4045dfecf96Smrg * where t = a0b0 and u = a1b1 4055dfecf96Smrg * 4065dfecf96Smrg * Karatsuba method reduces the number of multiplications. Example: 4075dfecf96Smrg * Square a 40 length number 4085dfecf96Smrg * instead of a plain 40*40 = 1600 multiplies/adds, it does: 4095dfecf96Smrg * 20*20+20*20+20*20 = 1200 4105dfecf96Smrg * but since it is recursive, every 20*20=400 is reduced to 4115dfecf96Smrg * 10*10+10*10+10*10=300 4125dfecf96Smrg * and so on. 4135dfecf96Smrg * The multiplication by x and x² is a just a shift, as it is a 4145dfecf96Smrg * power of two, and is implemented below by just writting at the 4155dfecf96Smrg * correct offset */ 4165dfecf96Smrglong 4175dfecf96Smrgmp_karatsuba_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 4185dfecf96Smrg{ 4195dfecf96Smrg BNI x; /* shift count */ 4205dfecf96Smrg BNI la0, la1, lb0, lb1; /* length of a0, a1, b0, and b1 */ 4215dfecf96Smrg BNS *t; /* temporary memory for t product */ 4225dfecf96Smrg BNS *u; /* temporary memory for u product */ 4235dfecf96Smrg BNS *r; /* pointer to rop */ 4245dfecf96Smrg long xlen, tlen, ulen; 4255dfecf96Smrg 4265dfecf96Smrg /* calculate value of x, that is 2^(BNSBITS*x) */ 4275dfecf96Smrg if (len1 >= len2) 4285dfecf96Smrg x = (len1 + 1) >> 1; 4295dfecf96Smrg else 4305dfecf96Smrg x = (len2 + 1) >> 1; 4315dfecf96Smrg 4325dfecf96Smrg /* calculate length of operands */ 4335dfecf96Smrg la0 = x; 4345dfecf96Smrg la1 = len1 - x; 4355dfecf96Smrg lb0 = x; 4365dfecf96Smrg lb1 = len2 - x; 4375dfecf96Smrg 4385dfecf96Smrg /* allocate buffer for t and (a0 + a1) */ 4395dfecf96Smrg tlen = la0 + lb0; 4405dfecf96Smrg t = mp_malloc(sizeof(BNS) * tlen); 4415dfecf96Smrg 4425dfecf96Smrg /* allocate buffer for u and (b0 + b1) */ 4435dfecf96Smrg if (la1 + lb1 < lb0 + lb1 + 1) 4445dfecf96Smrg ulen = lb0 + lb1 + 1; 4455dfecf96Smrg else 4465dfecf96Smrg ulen = la1 + lb1; 4475dfecf96Smrg u = mp_malloc(sizeof(BNS) * ulen); 4485dfecf96Smrg 4495dfecf96Smrg /* calculate a0 + a1, store result in t */ 4505dfecf96Smrg tlen = mp_add(t, op1, op1 + x, la0, la1); 4515dfecf96Smrg 4525dfecf96Smrg /* calculate b0 + b1, store result in u */ 4535dfecf96Smrg ulen = mp_add(u, op2, op2 + x, lb0, lb1); 4545dfecf96Smrg 4555dfecf96Smrg /* store (a0 + a1) * (b0 + b1) in rop */ 4565dfecf96Smrg 4575dfecf96Smrg r = rop + x; /* multiplied by 2^(BNSBITS*x) */ 4585dfecf96Smrg xlen = mp_mul(r, t, u, tlen, ulen); 4595dfecf96Smrg 4605dfecf96Smrg /* must zero t and u memory, this is required for mp_mul */ 4615dfecf96Smrg 4625dfecf96Smrg /* calculate t = a0 * b0 */ 4635dfecf96Smrg tlen = la0 + lb0; 4645dfecf96Smrg memset(t, '\0', sizeof(BNS) * tlen); 4655dfecf96Smrg tlen = mp_mul(t, op1, op2, la0, lb0); 4665dfecf96Smrg 4675dfecf96Smrg /* calculate u = a1 * b1 */ 4685dfecf96Smrg ulen = la1 + lb1; 4695dfecf96Smrg memset(u, '\0', sizeof(BNS) * ulen); 4705dfecf96Smrg ulen = mp_mul(u, op1 + x, op2 + x, la1, lb1); 4715dfecf96Smrg 4725dfecf96Smrg /* subtract t from partial result */ 4735dfecf96Smrg xlen = mp_sub(r, r, t, xlen, tlen); 4745dfecf96Smrg 4755dfecf96Smrg /* subtract u form partial result */ 4765dfecf96Smrg xlen = mp_sub(r, r, u, xlen, ulen); 4775dfecf96Smrg 4785dfecf96Smrg /* add ux^2 to partial result */ 4795dfecf96Smrg 4805dfecf96Smrg r = rop + (x << 1); /* multiplied by x^2 = 2^(BNSBITS*x*2) */ 4815dfecf96Smrg xlen = len1 + len2; 4825dfecf96Smrg xlen = mp_add(r, r, u, xlen, ulen); 4835dfecf96Smrg 4845dfecf96Smrg /* now add t to final result */ 4855dfecf96Smrg xlen = mp_add(rop, rop, t, xlen, tlen); 4865dfecf96Smrg 4875dfecf96Smrg mp_free(t); 4885dfecf96Smrg mp_free(u); 4895dfecf96Smrg 4905dfecf96Smrg if (xlen > 1 && rop[xlen - 1] == 0) 4915dfecf96Smrg --xlen; 4925dfecf96Smrg 4935dfecf96Smrg return (xlen); 4945dfecf96Smrg} 4955dfecf96Smrg 4965dfecf96Smrg /* Toom method (partially based on GMP documentation) 4975dfecf96Smrg * Evaluation at k = [ 0 1/2 1 2 oo ] 4985dfecf96Smrg * U(x) = (U2k + U1)k + U0 4995dfecf96Smrg * V(x) = (V2k + V1)k + V0 5005dfecf96Smrg * W(x) = U(x)V(x) 5015dfecf96Smrg * 5025dfecf96Smrg * Sample: 5035dfecf96Smrg * 123 * 456 5045dfecf96Smrg * 5055dfecf96Smrg * EVALUATION: 5065dfecf96Smrg * U(0) = (1*0+2)*0+3 => 3 5075dfecf96Smrg * U(1) = 1+(2+3*2)*2 => 17 5085dfecf96Smrg * U(2) = 1+2+3 => 6 5095dfecf96Smrg * U(3) = (1*2+2)*2+3 => 11 5105dfecf96Smrg * U(4) = 1+(2+3*0)*0 => 1 5115dfecf96Smrg * 5125dfecf96Smrg * V(0) = (4*0+5)*0+6 => 6 5135dfecf96Smrg * V(1) = 4+(5+6*2)*2 => 38 5145dfecf96Smrg * V(2) = 4+5+6 => 15 5155dfecf96Smrg * V(3) = (4*2+5)*2+6 => 32 5165dfecf96Smrg * V(4) = 4+(5+6*0)*0 => 4 5175dfecf96Smrg * 5185dfecf96Smrg * U = [ 3 17 6 11 1 ] 5195dfecf96Smrg * V = [ 6 38 15 32 4 ] 5205dfecf96Smrg * W = [ 18 646 90 352 4 ] 5215dfecf96Smrg * 5225dfecf96Smrg * After that, we have: 5235dfecf96Smrg * a = 18 (w0 already known) 5245dfecf96Smrg * b = 16w0 + 8w1 + 4w2 + 2w3 + w4 5255dfecf96Smrg * c = w0 + w1 + w2 + w3 + w4 5265dfecf96Smrg * d = w0 + 2w1 + 4w2 + 8w3 + 16w4 5275dfecf96Smrg * e = 4 (w4 already known) 5285dfecf96Smrg * 5295dfecf96Smrg * INTERPOLATION: 5305dfecf96Smrg * b = b -16a - e (354) 5315dfecf96Smrg * c = c - a - e (68) 5325dfecf96Smrg * d = d - a - 16e (270) 5335dfecf96Smrg * 5345dfecf96Smrg * w = (b + d) - 8c = (10w1+8w2+10w3) - (8w1+8w2+8w3) = 2w1+2w3 5355dfecf96Smrg * w = 2c - w (56) 5365dfecf96Smrg * b = b/2 = 4w1+w+w3 5375dfecf96Smrg * b = b-c = 4w1+w+w3 - w1+w2+w3 = 3w1+w2 5385dfecf96Smrg * c = w/2 (w2 = 28) 5395dfecf96Smrg * b = b-c = 3w1+c - c = 3w1 5405dfecf96Smrg * b = b/3 (w1 = 27) 5415dfecf96Smrg * d = d/2 5425dfecf96Smrg * d = d-b-w = b+w+4w3 - b-w = 4w3 5435dfecf96Smrg * d = d/4 (w3 = 13) 5445dfecf96Smrg * 5455dfecf96Smrg * RESULT: 5465dfecf96Smrg * w4*10^4 + w3*10³ + w2*10² + w1*10 + w0 5475dfecf96Smrg * 40000 + 13000 + 2800 + 270 + 18 5485dfecf96Smrg * 10 is the base where the calculation was done 5495dfecf96Smrg * 5505dfecf96Smrg * This sample uses small numbers, so it does not show the 5515dfecf96Smrg * advantage of the method. But for example (in base 10), when squaring 5525dfecf96Smrg * 123456789012345678901234567890 5535dfecf96Smrg * The normal method would do 30*30=900 multiplications 5545dfecf96Smrg * Karatsuba method would do 15*15*3=675 multiplications 5555dfecf96Smrg * Toom method would do 10*10*5=500 multiplications 5565dfecf96Smrg * Toom method has a larger overhead if compared with Karatsuba method, 5575dfecf96Smrg * due to evaluation and interpolation, so it should be used for larger 5585dfecf96Smrg * numbers, so that the computation time of evaluation/interpolation 5595dfecf96Smrg * would be smaller than the time spent using other methods. 5605dfecf96Smrg * 5615dfecf96Smrg * Note that Karatsuba method can be seen as a special case of 5625dfecf96Smrg * Toom method, i.e: 5635dfecf96Smrg * U1U0 * V1V0 5645dfecf96Smrg * with k = [ 0 1 oo ] 5655dfecf96Smrg * U = [ U0 U1+U0 U1 ] 5665dfecf96Smrg * V = [ V0 V1+V0 V1 ] 5675dfecf96Smrg * W = [ U0*V0 (U1+U0)*(V1+V0) (U1+V1) ] 5685dfecf96Smrg * 5695dfecf96Smrg * w0 = U0*V0 5705dfecf96Smrg * w = (U1+U0)*(V1+V0) 5715dfecf96Smrg * w2 = (U1*V1) 5725dfecf96Smrg * 5735dfecf96Smrg * w1 = w - w0 - w2 5745dfecf96Smrg * w2x² + w1x + w0 5755dfecf96Smrg * 5765dfecf96Smrg * See Knuth's Seminumerical Algorithms for a sample implemention 5775dfecf96Smrg * using 4 stacks and k = [ 0 1 2 3 ... ], based on the size of the 5785dfecf96Smrg * input. 5795dfecf96Smrg */ 5805dfecf96Smrglong 5815dfecf96Smrgmp_toom_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 5825dfecf96Smrg{ 5835dfecf96Smrg long size, xsize, i; 5845dfecf96Smrg BNI value; /* used in division */ 5855dfecf96Smrg BNS carry; 5865dfecf96Smrg BNI x; /* shift count */ 5875dfecf96Smrg BNI l1, l2; 5885dfecf96Smrg BNI al, bl, cl, dl, el, Ul[3], Vl[3]; 5895dfecf96Smrg BNS *a, *b, *c, *d, *e, *U[3], *V[3]; 5905dfecf96Smrg 5915dfecf96Smrg /* x is the base i.e. 2^(BNSBITS*x) */ 5925dfecf96Smrg x = (len1 + len2 + 4) / 6; 5935dfecf96Smrg l1 = len1 - (x << 1); /* length of remaining piece of op1 */ 5945dfecf96Smrg l2 = len2 - (x << 1); /* length of remaining piece of op2 */ 5955dfecf96Smrg 5965dfecf96Smrg /* allocate memory for storing U and V */ 5975dfecf96Smrg U[0] = mp_malloc(sizeof(BNS) * (x + 2)); 5985dfecf96Smrg V[0] = mp_malloc(sizeof(BNS) * (x + 2)); 5995dfecf96Smrg U[1] = mp_malloc(sizeof(BNS) * (x + 1)); 6005dfecf96Smrg V[1] = mp_malloc(sizeof(BNS) * (x + 1)); 6015dfecf96Smrg U[2] = mp_malloc(sizeof(BNS) * (x + 2)); 6025dfecf96Smrg V[2] = mp_malloc(sizeof(BNS) * (x + 2)); 6035dfecf96Smrg 6045dfecf96Smrg /* EVALUATE U AND V */ 6055dfecf96Smrg 6065dfecf96Smrg /* Numbers are in the format U2x²+U1x+U0 and V2x²+V1x+V0 */ 6075dfecf96Smrg 6085dfecf96Smrg /* U[0] = U2+U1*2+U0*4 */ 6095dfecf96Smrg 6105dfecf96Smrg /* store U1*2 in U[1], this value is used twice */ 6115dfecf96Smrg Ul[1] = mp_lshift(U[1], op1 + x, x, 1); 6125dfecf96Smrg 6135dfecf96Smrg /* store U0*4 in U[0] */ 6145dfecf96Smrg Ul[0] = mp_lshift(U[0], op1, x, 2); 6155dfecf96Smrg /* add U1*2 to U[0] */ 6165dfecf96Smrg Ul[0] = mp_add(U[0], U[0], U[1], Ul[0], Ul[1]); 6175dfecf96Smrg /* add U2 to U[0] */ 6185dfecf96Smrg Ul[0] = mp_add(U[0], U[0], op1 + x + x, Ul[0], l1); 6195dfecf96Smrg 6205dfecf96Smrg /* U[2] = U2*4+U1*2+U0 */ 6215dfecf96Smrg 6225dfecf96Smrg /* store U2*4 in U[2] */ 6235dfecf96Smrg Ul[2] = mp_lshift(U[2], op1 + x + x, l1, 2); 6245dfecf96Smrg /* add U1*2 to U[2] */ 6255dfecf96Smrg Ul[2] = mp_add(U[2], U[2], U[1], Ul[2], Ul[1]); 6265dfecf96Smrg /* add U0 to U[2] */ 6275dfecf96Smrg Ul[2] = mp_add(U[2], U[2], op1, Ul[2], x); 6285dfecf96Smrg 6295dfecf96Smrg /* U[1] = U2+U1+U0 */ 6305dfecf96Smrg 6315dfecf96Smrg Ul[1] = mp_add(U[1], op1, op1 + x, x, x); 6325dfecf96Smrg Ul[1] = mp_add(U[1], U[1], op1 + x + x, Ul[1], l1); 6335dfecf96Smrg 6345dfecf96Smrg 6355dfecf96Smrg /* Evaluate V[x], same code as U[x] */ 6365dfecf96Smrg Vl[1] = mp_lshift(V[1], op2 + x, x, 1); 6375dfecf96Smrg Vl[0] = mp_lshift(V[0], op2, x, 2); 6385dfecf96Smrg Vl[0] = mp_add(V[0], V[0], V[1], Vl[0], Vl[1]); 6395dfecf96Smrg Vl[0] = mp_add(V[0], V[0], op2 + x + x, Vl[0], l2); 6405dfecf96Smrg Vl[2] = mp_lshift(V[2], op2 + x + x, l2, 2); 6415dfecf96Smrg Vl[2] = mp_add(V[2], V[2], V[1], Vl[2], Vl[1]); 6425dfecf96Smrg Vl[2] = mp_add(V[2], V[2], op2, Vl[2], x); 6435dfecf96Smrg Vl[1] = mp_add(V[1], op2, op2 + x, x, x); 6445dfecf96Smrg Vl[1] = mp_add(V[1], V[1], op2 + x + x, Vl[1], l2); 6455dfecf96Smrg 6465dfecf96Smrg 6475dfecf96Smrg /* MULTIPLY U[] AND V[] */ 6485dfecf96Smrg 6495dfecf96Smrg /* calculate (U2+U1*2+U0*4) * (V2+V1*2+V0*4) */ 6505dfecf96Smrg b = mp_calloc(1, sizeof(BNS) * (Ul[0] * Vl[0])); 6515dfecf96Smrg bl = mp_mul(b, U[0], V[0], Ul[0], Vl[0]); 6525dfecf96Smrg mp_free(U[0]); 6535dfecf96Smrg mp_free(V[0]); 6545dfecf96Smrg 6555dfecf96Smrg /* calculate (U2+U1+U0) * (V2+V1+V0) */ 6565dfecf96Smrg c = mp_calloc(1, sizeof(BNS) * (Ul[1] * Vl[1])); 6575dfecf96Smrg cl = mp_mul(c, U[1], V[1], Ul[1], Vl[1]); 6585dfecf96Smrg mp_free(U[1]); 6595dfecf96Smrg mp_free(V[1]); 6605dfecf96Smrg 6615dfecf96Smrg /* calculate (U2*4+U1*2+U0) * (V2*4+V1*2+V0) */ 6625dfecf96Smrg d = mp_calloc(1, sizeof(BNS) * (Ul[2] * Vl[2])); 6635dfecf96Smrg dl = mp_mul(d, U[2], V[2], Ul[2], Vl[2]); 6645dfecf96Smrg mp_free(U[2]); 6655dfecf96Smrg mp_free(V[2]); 6665dfecf96Smrg 6675dfecf96Smrg /* calculate U0 * V0 */ 6685dfecf96Smrg a = mp_calloc(1, sizeof(BNS) * (x + x)); 6695dfecf96Smrg al = mp_mul(a, op1, op2, x, x); 6705dfecf96Smrg 6715dfecf96Smrg /* calculate U2 * V2 */ 6725dfecf96Smrg e = mp_calloc(1, sizeof(BNS) * (l1 + l2)); 6735dfecf96Smrg el = mp_mul(e, op1 + x + x, op2 + x + x, l1, l2); 6745dfecf96Smrg 6755dfecf96Smrg 6765dfecf96Smrg /* INTERPOLATE COEFFICIENTS */ 6775dfecf96Smrg 6785dfecf96Smrg /* b = b - 16a - e */ 6795dfecf96Smrg size = mp_lshift(rop, a, al, 4); 6805dfecf96Smrg bl = mp_sub(b, b, rop, bl, size); 6815dfecf96Smrg bl = mp_sub(b, b, e, bl, el); 6825dfecf96Smrg 6835dfecf96Smrg /* c = c - a - e*/ 6845dfecf96Smrg cl = mp_sub(c, c, a, cl, al); 6855dfecf96Smrg cl = mp_sub(c, c, e, cl, el); 6865dfecf96Smrg 6875dfecf96Smrg /* d = d - a - 16e */ 6885dfecf96Smrg dl = mp_sub(d, d, a, dl, al); 6895dfecf96Smrg size = mp_lshift(rop, e, el, 4); 6905dfecf96Smrg dl = mp_sub(d, d, rop, dl, size); 6915dfecf96Smrg 6925dfecf96Smrg /* w = (b + d) - 8c */ 6935dfecf96Smrg size = mp_add(rop, b, d, bl, dl); 6945dfecf96Smrg xsize = mp_lshift(rop + size, c, cl, 3); /* rop has enough storage */ 6955dfecf96Smrg size = mp_sub(rop, rop, rop + size, size, xsize); 6965dfecf96Smrg 6975dfecf96Smrg /* w = 2c - w*/ 6985dfecf96Smrg xsize = mp_lshift(rop + size, c, cl, 1); 6995dfecf96Smrg size = mp_sub(rop, rop + size, rop, xsize, size); 7005dfecf96Smrg 7015dfecf96Smrg /* b = b/2 */ 7025dfecf96Smrg bl = mp_rshift(b, b, bl, 1); 7035dfecf96Smrg 7045dfecf96Smrg /* b = b - c */ 7055dfecf96Smrg bl = mp_sub(b, b, c, bl, cl); 7065dfecf96Smrg 7075dfecf96Smrg /* c = w / 2 */ 7085dfecf96Smrg cl = mp_rshift(c, rop, size, 1); 7095dfecf96Smrg 7105dfecf96Smrg /* b = b - c */ 7115dfecf96Smrg bl = mp_sub(b, b, c, bl, cl); 7125dfecf96Smrg 7135dfecf96Smrg /* b = b/3 */ 7145dfecf96Smrg /* maybe the most expensive calculation */ 7155dfecf96Smrg i = bl - 1; 7165dfecf96Smrg value = b[i]; 7175dfecf96Smrg b[i] = value / 3; 7185dfecf96Smrg for (--i; i >= 0; i--) { 7195dfecf96Smrg carry = value % 3; 7205dfecf96Smrg value = ((BNI)carry << BNSBITS) + b[i]; 7215dfecf96Smrg b[i] = (BNS)(value / 3); 7225dfecf96Smrg } 7235dfecf96Smrg 7245dfecf96Smrg /* d = d/2 */ 7255dfecf96Smrg dl = mp_rshift(d, d, dl, 1); 7265dfecf96Smrg 7275dfecf96Smrg /* d = d - b - w */ 7285dfecf96Smrg dl = mp_sub(d, d, b, dl, bl); 7295dfecf96Smrg dl = mp_sub(d, d, rop, dl, size); 7305dfecf96Smrg 7315dfecf96Smrg /* d = d/4 */ 7325dfecf96Smrg dl = mp_rshift(d, d, dl, 2); 7335dfecf96Smrg 7345dfecf96Smrg 7355dfecf96Smrg /* STORE RESULT IN ROP */ 7365dfecf96Smrg /* first clear memory used as temporary variable w and 8c */ 7375dfecf96Smrg memset(rop, '\0', sizeof(BNS) * (len1 + len2)); 7385dfecf96Smrg 7395dfecf96Smrg i = x * 4; 7405dfecf96Smrg xsize = (len1 + len2) - i; 7415dfecf96Smrg size = mp_add(rop + i, rop + i, e, xsize, el) + i; 7425dfecf96Smrg i = x * 3; 7435dfecf96Smrg xsize = size - i; 7445dfecf96Smrg size = mp_add(rop + i, rop + i, d, xsize, dl) + i; 7455dfecf96Smrg i = x * 2; 7465dfecf96Smrg xsize = size - i; 7475dfecf96Smrg size = mp_add(rop + i, rop + i, c, xsize, cl) + i; 7485dfecf96Smrg i = x; 7495dfecf96Smrg xsize = size - i; 7505dfecf96Smrg size = mp_add(rop + i, rop + i, b, xsize, bl) + i; 7515dfecf96Smrg size = mp_add(rop, rop, a, size, al); 7525dfecf96Smrg 7535dfecf96Smrg mp_free(e); 7545dfecf96Smrg mp_free(d); 7555dfecf96Smrg mp_free(c); 7565dfecf96Smrg mp_free(b); 7575dfecf96Smrg mp_free(a); 7585dfecf96Smrg 7595dfecf96Smrg if (size > 1 && rop[size - 1] == 0) 7605dfecf96Smrg --size; 7615dfecf96Smrg 7625dfecf96Smrg return (size); 7635dfecf96Smrg} 7645dfecf96Smrg 7655dfecf96Smrglong 7665dfecf96Smrgmp_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 7675dfecf96Smrg{ 7685dfecf96Smrg if (len1 < len2) 7695dfecf96Smrg MP_SWAP(op1, op2, len1, len2); 7705dfecf96Smrg 7715dfecf96Smrg if (len1 < KARATSUBA || len2 < KARATSUBA) 7725dfecf96Smrg return (mp_base_mul(rop, op1, op2, len1, len2)); 7735dfecf96Smrg else if (len1 < TOOM && len2 < TOOM && len2 > ((len1 + 1) >> 1)) 7745dfecf96Smrg return (mp_karatsuba_mul(rop, op1, op2, len1, len2)); 7755dfecf96Smrg else if (len1 >= TOOM && len2 >= TOOM && (len2 + 2) / 3 == (len1 + 2) / 3) 7765dfecf96Smrg return (mp_toom_mul(rop, op1, op2, len1, len2)); 7775dfecf96Smrg else { 7785dfecf96Smrg long xsize, psize, isize; 7795dfecf96Smrg BNS *ptr; 7805dfecf96Smrg 7815dfecf96Smrg /* adjust index pointer and estimated size of result */ 7825dfecf96Smrg isize = 0; 7835dfecf96Smrg xsize = len1 + len2; 7845dfecf96Smrg mp_mul(rop, op1, op2, len2, len2); 7855dfecf96Smrg /* adjust pointers */ 7865dfecf96Smrg len1 -= len2; 7875dfecf96Smrg op1 += len2; 7885dfecf96Smrg 7895dfecf96Smrg /* allocate buffer for intermediate multiplications */ 7905dfecf96Smrg if (len1 > len2) 7915dfecf96Smrg ptr = mp_calloc(1, sizeof(BNS) * (len2 + len2)); 7925dfecf96Smrg else 7935dfecf96Smrg ptr = mp_calloc(1, sizeof(BNS) * (len1 + len2)); 7945dfecf96Smrg 7955dfecf96Smrg /* loop multiplying len2 size operands at a time */ 7965dfecf96Smrg while (len1 >= len2) { 7975dfecf96Smrg isize += len2; 7985dfecf96Smrg psize = mp_mul(ptr, op1, op2, len2, len2); 7995dfecf96Smrg mp_add(rop + isize, rop + isize, ptr, xsize - isize, psize); 8005dfecf96Smrg len1 -= len2; 8015dfecf96Smrg op1 += len2; 8025dfecf96Smrg 8035dfecf96Smrg /* multiplication routines require zeroed memory */ 8045dfecf96Smrg memset(ptr, '\0', sizeof(BNS) * (MIN(len1, len2) + len2)); 8055dfecf96Smrg } 8065dfecf96Smrg 8075dfecf96Smrg /* len1 was not a multiple of len2 */ 8085dfecf96Smrg if (len1) { 8095dfecf96Smrg isize += len2; 8105dfecf96Smrg psize = mp_mul(ptr, op2, op1, len2, len1); 8115dfecf96Smrg mp_add(rop + isize, rop + isize, ptr, xsize, psize); 8125dfecf96Smrg } 8135dfecf96Smrg 8145dfecf96Smrg /* adjust result size */ 8155dfecf96Smrg if (rop[xsize - 1] == 0) 8165dfecf96Smrg --xsize; 8175dfecf96Smrg 8185dfecf96Smrg mp_free(ptr); 8195dfecf96Smrg 8205dfecf96Smrg return (xsize); 8215dfecf96Smrg } 8225dfecf96Smrg} 823