mp.c revision 31de2854
1/* 2 * Copyright (c) 2002 by The XFree86 Project, Inc. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 * 22 * Except as contained in this notice, the name of the XFree86 Project shall 23 * not be used in advertising or otherwise to promote the sale, use or other 24 * dealings in this Software without prior written authorization from the 25 * XFree86 Project. 26 * 27 * Author: Paulo César Pereira de Andrade 28 */ 29 30/* $XFree86: xc/programs/xedit/lisp/mp/mp.c,v 1.2 2002/11/08 08:01:00 paulo Exp $ */ 31 32#include "mp.h" 33 34/* 35 * TODO: 36 * o Optimize squaring 37 * o Write better division code and move from mpi.c to here 38 * o Make multiplication code don't required memory to be zeroed 39 * + The first step is easy, just multiply the low word, 40 * then the high word, that may overlap with the result 41 * of the first multiply (in case of carry), and then 42 * just make sure carry is properly propagated in the 43 * subsequent multiplications. 44 * + Some code needs also to be rewritten because some 45 * intermediate addition code in mp_mul, mp_karatsuba_mul, 46 * and mp_toom_mul is assuming the memory is zeroed. 47 */ 48 49/* 50 * Prototypes 51 */ 52 /* out of memory handler */ 53static void mp_outmem(void); 54 55 /* memory allocation fallback functions */ 56static void *_mp_malloc(size_t); 57static void *_mp_calloc(size_t, size_t); 58static void *_mp_realloc(void*, size_t); 59static void _mp_free(void*); 60 61/* 62 * Initialization 63 */ 64static mp_malloc_fun __mp_malloc = _mp_malloc; 65static mp_calloc_fun __mp_calloc = _mp_calloc; 66static mp_realloc_fun __mp_realloc = _mp_realloc; 67static mp_free_fun __mp_free = _mp_free; 68 69/* 70 * Implementation 71 */ 72static void 73mp_outmem(void) 74{ 75 fprintf(stderr, "out of memory in MP library.\n"); 76 exit(1); 77} 78 79static void * 80_mp_malloc(size_t size) 81{ 82 return (malloc(size)); 83} 84 85void * 86mp_malloc(size_t size) 87{ 88 void *pointer = (*__mp_malloc)(size); 89 90 if (pointer == NULL) 91 mp_outmem(); 92 93 return (pointer); 94} 95 96mp_malloc_fun 97mp_set_malloc(mp_malloc_fun fun) 98{ 99 mp_malloc_fun old = __mp_malloc; 100 101 __mp_malloc = fun; 102 103 return (old); 104} 105 106static void * 107_mp_calloc(size_t nmemb, size_t size) 108{ 109 return (calloc(nmemb, size)); 110} 111 112void * 113mp_calloc(size_t nmemb, size_t size) 114{ 115 void *pointer = (*__mp_calloc)(nmemb, size); 116 117 if (pointer == NULL) 118 mp_outmem(); 119 120 return (pointer); 121} 122 123mp_calloc_fun 124mp_set_calloc(mp_calloc_fun fun) 125{ 126 mp_calloc_fun old = __mp_calloc; 127 128 __mp_calloc = fun; 129 130 return (old); 131} 132 133static void * 134_mp_realloc(void *old, size_t size) 135{ 136 return (realloc(old, size)); 137} 138 139void * 140mp_realloc(void *old, size_t size) 141{ 142 void *pointer = (*__mp_realloc)(old, size); 143 144 if (pointer == NULL) 145 mp_outmem(); 146 147 return (pointer); 148} 149 150mp_realloc_fun 151mp_set_realloc(mp_realloc_fun fun) 152{ 153 mp_realloc_fun old = __mp_realloc; 154 155 __mp_realloc = fun; 156 157 return (old); 158} 159 160static void 161_mp_free(void *pointer) 162{ 163 free(pointer); 164} 165 166void 167mp_free(void *pointer) 168{ 169 (*__mp_free)(pointer); 170} 171 172mp_free_fun 173mp_set_free(mp_free_fun fun) 174{ 175 mp_free_fun old = __mp_free; 176 177 __mp_free = fun; 178 179 return (old); 180} 181 182long 183mp_add(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 184{ 185 BNI value; /* intermediate result */ 186 BNS carry; /* carry flag */ 187 long size; /* result size */ 188 189 if (len1 < len2) 190 MP_SWAP(op1, op2, len1, len2); 191 192 /* unroll start of loop */ 193 value = (BNI)op1[0] + op2[0]; 194 rop[0] = value; 195 carry = value >> BNSBITS; 196 197 /* add op1 and op2 */ 198 for (size = 1; size < len2; size++) { 199 value = (BNI)op1[size] + op2[size] + carry; 200 rop[size] = value; 201 carry = value >> BNSBITS; 202 } 203 if (rop != op1) { 204 for (; size < len1; size++) { 205 value = (BNI)op1[size] + carry; 206 rop[size] = value; 207 carry = value >> BNSBITS; 208 } 209 } 210 else { 211 /* if rop == op1, than just adjust carry */ 212 for (; carry && size < len1; size++) { 213 value = (BNI)op1[size] + carry; 214 rop[size] = value; 215 carry = value >> BNSBITS; 216 } 217 size = len1; 218 } 219 if (carry) 220 rop[size++] = carry; 221 222 return (size); 223} 224 225long 226mp_sub(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 227{ 228 long svalue; /* intermediate result */ 229 BNS carry; /* carry flag */ 230 long size; /* result size */ 231 232 /* special case */ 233 if (op1 == op2) { 234 rop[0] = 0; 235 236 return (1); 237 } 238 239 /* unroll start of loop */ 240 svalue = (long)op1[0] - op2[0]; 241 rop[0] = svalue; 242 carry = svalue < 0; 243 244 /* subtracts op2 from op1 */ 245 for (size = 1; size < len2; size++) { 246 svalue = (long)(op1[size]) - op2[size] - carry; 247 rop[size] = svalue; 248 carry = svalue < 0; 249 } 250 if (rop != op1) { 251 for (; size < len1; size++) { 252 svalue = op1[size] - carry; 253 rop[size] = svalue; 254 carry = svalue < 0; 255 } 256 } 257 else { 258 /* if rop == op1, than just adjust carry */ 259 for (; carry && size < len1; size++) { 260 svalue = (long)op1[size] - carry; 261 rop[size] = svalue; 262 carry = svalue < 0; 263 } 264 size = len1; 265 } 266 267 /* calculate result size */ 268 while (size > 1 && rop[size - 1] == 0) 269 --size; 270 271 return (size); 272} 273 274long 275mp_lshift(BNS *rop, BNS *op, BNI len, long shift) 276{ 277 long i, size; 278 BNI words, bits; /* how many word and bit shifts */ 279 280 words = shift / BNSBITS; 281 bits = shift % BNSBITS; 282 size = len + words; 283 284 if (bits) { 285 BNS hi, lo; 286 BNI carry; 287 int adj; 288 289 for (i = 1, carry = CARRY >> 1; carry; i++, carry >>= 1) 290 if (op[len - 1] & carry) 291 break; 292 adj = (bits + (BNSBITS - i)) / BNSBITS; 293 size += adj; 294 295 lo = hi = op[0]; 296 rop[words] = lo << bits; 297 for (i = 1; i < len; i++) { 298 hi = op[i]; 299 rop[words + i] = hi << bits | (lo >> (BNSBITS - bits)); 300 lo = hi; 301 } 302 if (adj) 303 rop[size - 1] = hi >> (BNSBITS - bits); 304 } 305 else 306 memmove(rop + size - len, op, sizeof(BNS) * len); 307 308 if (words) 309 memset(rop, '\0', sizeof(BNS) * words); 310 311 return (size); 312} 313 314long 315mp_rshift(BNS *rop, BNS *op, BNI len, long shift) 316{ 317 int adj = 0; 318 long i, size; 319 BNI words, bits; /* how many word and bit shifts */ 320 321 words = shift / BNSBITS; 322 bits = shift % BNSBITS; 323 size = len - words; 324 325 if (bits) { 326 BNS hi, lo; 327 BNI carry; 328 329 for (i = 0, carry = CARRY >> 1; carry; i++, carry >>= 1) 330 if (op[len - 1] & carry) 331 break; 332 adj = (bits + i) / BNSBITS; 333 if (size - adj == 0) { 334 rop[0] = 0; 335 336 return (1); 337 } 338 339 hi = lo = op[words + size - 1]; 340 rop[size - 1] = hi >> bits; 341 for (i = size - 2; i >= 0; i--) { 342 lo = op[words + i]; 343 rop[i] = (lo >> bits) | (hi << (BNSBITS - bits)); 344 hi = lo; 345 } 346 if (adj) 347 rop[0] |= lo << (BNSBITS - bits); 348 } 349 else 350 memmove(rop, op + len - size, size * sizeof(BNS)); 351 352 return (size - adj); 353} 354 355 /* rop must be a pointer to len1 + len2 elements 356 * rop cannot be either op1 or op2 357 * rop must be all zeros */ 358long 359mp_base_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 360{ 361 long i, j; /* counters */ 362 BNI value; /* intermediate result */ 363 BNS carry; /* carry value */ 364 long size = len1 + len2; 365 366 /* simple optimization: first pass does not need to deference rop[i+j] */ 367 if (op1[0]) { 368 value = (BNI)(op1[0]) * op2[0]; 369 rop[0] = value; 370 carry = (BNS)(value >> BNSBITS); 371 for (j = 1; j < len2; j++) { 372 value = (BNI)(op1[0]) * op2[j] + carry; 373 rop[j] = value; 374 carry = (BNS)(value >> BNSBITS); 375 } 376 rop[j] = carry; 377 } 378 379 /* do the multiplication */ 380 for (i = 1; i < len1; i++) { 381 if (op1[i]) { 382 /* unrool loop initialization */ 383 value = (BNI)(op1[i]) * op2[0] + rop[i]; 384 rop[i] = value; 385 carry = (BNS)(value >> BNSBITS); 386 /* multiply */ 387 for (j = 1; j < len2; j++) { 388 value = (BNI)(op1[i]) * op2[j] + rop[i + j] + carry; 389 rop[i + j] = value; 390 carry = (BNS)(value >> BNSBITS); 391 } 392 rop[i + j] = carry; 393 } 394 } 395 396 if (size > 1 && rop[size - 1] == 0) 397 --size; 398 399 return (size); 400} 401 402 /* Karatsuba method 403 * t + ((a0 + a1) (b0 + b1) - t - u) x + ux² 404 * where t = a0b0 and u = a1b1 405 * 406 * Karatsuba method reduces the number of multiplications. Example: 407 * Square a 40 length number 408 * instead of a plain 40*40 = 1600 multiplies/adds, it does: 409 * 20*20+20*20+20*20 = 1200 410 * but since it is recursive, every 20*20=400 is reduced to 411 * 10*10+10*10+10*10=300 412 * and so on. 413 * The multiplication by x and x² is a just a shift, as it is a 414 * power of two, and is implemented below by just writting at the 415 * correct offset */ 416long 417mp_karatsuba_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 418{ 419 BNI x; /* shift count */ 420 BNI la0, la1, lb0, lb1; /* length of a0, a1, b0, and b1 */ 421 BNS *t; /* temporary memory for t product */ 422 BNS *u; /* temporary memory for u product */ 423 BNS *r; /* pointer to rop */ 424 long xlen, tlen, ulen; 425 426 /* calculate value of x, that is 2^(BNSBITS*x) */ 427 if (len1 >= len2) 428 x = (len1 + 1) >> 1; 429 else 430 x = (len2 + 1) >> 1; 431 432 /* calculate length of operands */ 433 la0 = x; 434 la1 = len1 - x; 435 lb0 = x; 436 lb1 = len2 - x; 437 438 /* allocate buffer for t and (a0 + a1) */ 439 tlen = la0 + lb0; 440 t = mp_malloc(sizeof(BNS) * tlen); 441 442 /* allocate buffer for u and (b0 + b1) */ 443 if (la1 + lb1 < lb0 + lb1 + 1) 444 ulen = lb0 + lb1 + 1; 445 else 446 ulen = la1 + lb1; 447 u = mp_malloc(sizeof(BNS) * ulen); 448 449 /* calculate a0 + a1, store result in t */ 450 tlen = mp_add(t, op1, op1 + x, la0, la1); 451 452 /* calculate b0 + b1, store result in u */ 453 ulen = mp_add(u, op2, op2 + x, lb0, lb1); 454 455 /* store (a0 + a1) * (b0 + b1) in rop */ 456 457 r = rop + x; /* multiplied by 2^(BNSBITS*x) */ 458 xlen = mp_mul(r, t, u, tlen, ulen); 459 460 /* must zero t and u memory, this is required for mp_mul */ 461 462 /* calculate t = a0 * b0 */ 463 tlen = la0 + lb0; 464 memset(t, '\0', sizeof(BNS) * tlen); 465 tlen = mp_mul(t, op1, op2, la0, lb0); 466 467 /* calculate u = a1 * b1 */ 468 ulen = la1 + lb1; 469 memset(u, '\0', sizeof(BNS) * ulen); 470 ulen = mp_mul(u, op1 + x, op2 + x, la1, lb1); 471 472 /* subtract t from partial result */ 473 xlen = mp_sub(r, r, t, xlen, tlen); 474 475 /* subtract u form partial result */ 476 xlen = mp_sub(r, r, u, xlen, ulen); 477 478 /* add ux^2 to partial result */ 479 480 r = rop + (x << 1); /* multiplied by x^2 = 2^(BNSBITS*x*2) */ 481 xlen = len1 + len2; 482 xlen = mp_add(r, r, u, xlen, ulen); 483 484 /* now add t to final result */ 485 xlen = mp_add(rop, rop, t, xlen, tlen); 486 487 mp_free(t); 488 mp_free(u); 489 490 if (xlen > 1 && rop[xlen - 1] == 0) 491 --xlen; 492 493 return (xlen); 494} 495 496 /* Toom method (partially based on GMP documentation) 497 * Evaluation at k = [ 0 1/2 1 2 oo ] 498 * U(x) = (U2k + U1)k + U0 499 * V(x) = (V2k + V1)k + V0 500 * W(x) = U(x)V(x) 501 * 502 * Sample: 503 * 123 * 456 504 * 505 * EVALUATION: 506 * U(0) = (1*0+2)*0+3 => 3 507 * U(1) = 1+(2+3*2)*2 => 17 508 * U(2) = 1+2+3 => 6 509 * U(3) = (1*2+2)*2+3 => 11 510 * U(4) = 1+(2+3*0)*0 => 1 511 * 512 * V(0) = (4*0+5)*0+6 => 6 513 * V(1) = 4+(5+6*2)*2 => 38 514 * V(2) = 4+5+6 => 15 515 * V(3) = (4*2+5)*2+6 => 32 516 * V(4) = 4+(5+6*0)*0 => 4 517 * 518 * U = [ 3 17 6 11 1 ] 519 * V = [ 6 38 15 32 4 ] 520 * W = [ 18 646 90 352 4 ] 521 * 522 * After that, we have: 523 * a = 18 (w0 already known) 524 * b = 16w0 + 8w1 + 4w2 + 2w3 + w4 525 * c = w0 + w1 + w2 + w3 + w4 526 * d = w0 + 2w1 + 4w2 + 8w3 + 16w4 527 * e = 4 (w4 already known) 528 * 529 * INTERPOLATION: 530 * b = b -16a - e (354) 531 * c = c - a - e (68) 532 * d = d - a - 16e (270) 533 * 534 * w = (b + d) - 8c = (10w1+8w2+10w3) - (8w1+8w2+8w3) = 2w1+2w3 535 * w = 2c - w (56) 536 * b = b/2 = 4w1+w+w3 537 * b = b-c = 4w1+w+w3 - w1+w2+w3 = 3w1+w2 538 * c = w/2 (w2 = 28) 539 * b = b-c = 3w1+c - c = 3w1 540 * b = b/3 (w1 = 27) 541 * d = d/2 542 * d = d-b-w = b+w+4w3 - b-w = 4w3 543 * d = d/4 (w3 = 13) 544 * 545 * RESULT: 546 * w4*10^4 + w3*10³ + w2*10² + w1*10 + w0 547 * 40000 + 13000 + 2800 + 270 + 18 548 * 10 is the base where the calculation was done 549 * 550 * This sample uses small numbers, so it does not show the 551 * advantage of the method. But for example (in base 10), when squaring 552 * 123456789012345678901234567890 553 * The normal method would do 30*30=900 multiplications 554 * Karatsuba method would do 15*15*3=675 multiplications 555 * Toom method would do 10*10*5=500 multiplications 556 * Toom method has a larger overhead if compared with Karatsuba method, 557 * due to evaluation and interpolation, so it should be used for larger 558 * numbers, so that the computation time of evaluation/interpolation 559 * would be smaller than the time spent using other methods. 560 * 561 * Note that Karatsuba method can be seen as a special case of 562 * Toom method, i.e: 563 * U1U0 * V1V0 564 * with k = [ 0 1 oo ] 565 * U = [ U0 U1+U0 U1 ] 566 * V = [ V0 V1+V0 V1 ] 567 * W = [ U0*V0 (U1+U0)*(V1+V0) (U1+V1) ] 568 * 569 * w0 = U0*V0 570 * w = (U1+U0)*(V1+V0) 571 * w2 = (U1*V1) 572 * 573 * w1 = w - w0 - w2 574 * w2x² + w1x + w0 575 * 576 * See Knuth's Seminumerical Algorithms for a sample implemention 577 * using 4 stacks and k = [ 0 1 2 3 ... ], based on the size of the 578 * input. 579 */ 580long 581mp_toom_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 582{ 583 long size, xsize, i; 584 BNI value; /* used in division */ 585 BNS carry; 586 BNI x; /* shift count */ 587 BNI l1, l2; 588 BNI al, bl, cl, dl, el, Ul[3], Vl[3]; 589 BNS *a, *b, *c, *d, *e, *U[3], *V[3]; 590 591 /* x is the base i.e. 2^(BNSBITS*x) */ 592 x = (len1 + len2 + 4) / 6; 593 l1 = len1 - (x << 1); /* length of remaining piece of op1 */ 594 l2 = len2 - (x << 1); /* length of remaining piece of op2 */ 595 596 /* allocate memory for storing U and V */ 597 U[0] = mp_malloc(sizeof(BNS) * (x + 2)); 598 V[0] = mp_malloc(sizeof(BNS) * (x + 2)); 599 U[1] = mp_malloc(sizeof(BNS) * (x + 1)); 600 V[1] = mp_malloc(sizeof(BNS) * (x + 1)); 601 U[2] = mp_malloc(sizeof(BNS) * (x + 2)); 602 V[2] = mp_malloc(sizeof(BNS) * (x + 2)); 603 604 /* EVALUATE U AND V */ 605 606 /* Numbers are in the format U2x²+U1x+U0 and V2x²+V1x+V0 */ 607 608 /* U[0] = U2+U1*2+U0*4 */ 609 610 /* store U1*2 in U[1], this value is used twice */ 611 Ul[1] = mp_lshift(U[1], op1 + x, x, 1); 612 613 /* store U0*4 in U[0] */ 614 Ul[0] = mp_lshift(U[0], op1, x, 2); 615 /* add U1*2 to U[0] */ 616 Ul[0] = mp_add(U[0], U[0], U[1], Ul[0], Ul[1]); 617 /* add U2 to U[0] */ 618 Ul[0] = mp_add(U[0], U[0], op1 + x + x, Ul[0], l1); 619 620 /* U[2] = U2*4+U1*2+U0 */ 621 622 /* store U2*4 in U[2] */ 623 Ul[2] = mp_lshift(U[2], op1 + x + x, l1, 2); 624 /* add U1*2 to U[2] */ 625 Ul[2] = mp_add(U[2], U[2], U[1], Ul[2], Ul[1]); 626 /* add U0 to U[2] */ 627 Ul[2] = mp_add(U[2], U[2], op1, Ul[2], x); 628 629 /* U[1] = U2+U1+U0 */ 630 631 Ul[1] = mp_add(U[1], op1, op1 + x, x, x); 632 Ul[1] = mp_add(U[1], U[1], op1 + x + x, Ul[1], l1); 633 634 635 /* Evaluate V[x], same code as U[x] */ 636 Vl[1] = mp_lshift(V[1], op2 + x, x, 1); 637 Vl[0] = mp_lshift(V[0], op2, x, 2); 638 Vl[0] = mp_add(V[0], V[0], V[1], Vl[0], Vl[1]); 639 Vl[0] = mp_add(V[0], V[0], op2 + x + x, Vl[0], l2); 640 Vl[2] = mp_lshift(V[2], op2 + x + x, l2, 2); 641 Vl[2] = mp_add(V[2], V[2], V[1], Vl[2], Vl[1]); 642 Vl[2] = mp_add(V[2], V[2], op2, Vl[2], x); 643 Vl[1] = mp_add(V[1], op2, op2 + x, x, x); 644 Vl[1] = mp_add(V[1], V[1], op2 + x + x, Vl[1], l2); 645 646 647 /* MULTIPLY U[] AND V[] */ 648 649 /* calculate (U2+U1*2+U0*4) * (V2+V1*2+V0*4) */ 650 b = mp_calloc(1, sizeof(BNS) * (Ul[0] * Vl[0])); 651 bl = mp_mul(b, U[0], V[0], Ul[0], Vl[0]); 652 mp_free(U[0]); 653 mp_free(V[0]); 654 655 /* calculate (U2+U1+U0) * (V2+V1+V0) */ 656 c = mp_calloc(1, sizeof(BNS) * (Ul[1] * Vl[1])); 657 cl = mp_mul(c, U[1], V[1], Ul[1], Vl[1]); 658 mp_free(U[1]); 659 mp_free(V[1]); 660 661 /* calculate (U2*4+U1*2+U0) * (V2*4+V1*2+V0) */ 662 d = mp_calloc(1, sizeof(BNS) * (Ul[2] * Vl[2])); 663 dl = mp_mul(d, U[2], V[2], Ul[2], Vl[2]); 664 mp_free(U[2]); 665 mp_free(V[2]); 666 667 /* calculate U0 * V0 */ 668 a = mp_calloc(1, sizeof(BNS) * (x + x)); 669 al = mp_mul(a, op1, op2, x, x); 670 671 /* calculate U2 * V2 */ 672 e = mp_calloc(1, sizeof(BNS) * (l1 + l2)); 673 el = mp_mul(e, op1 + x + x, op2 + x + x, l1, l2); 674 675 676 /* INTERPOLATE COEFFICIENTS */ 677 678 /* b = b - 16a - e */ 679 size = mp_lshift(rop, a, al, 4); 680 bl = mp_sub(b, b, rop, bl, size); 681 bl = mp_sub(b, b, e, bl, el); 682 683 /* c = c - a - e*/ 684 cl = mp_sub(c, c, a, cl, al); 685 cl = mp_sub(c, c, e, cl, el); 686 687 /* d = d - a - 16e */ 688 dl = mp_sub(d, d, a, dl, al); 689 size = mp_lshift(rop, e, el, 4); 690 dl = mp_sub(d, d, rop, dl, size); 691 692 /* w = (b + d) - 8c */ 693 size = mp_add(rop, b, d, bl, dl); 694 xsize = mp_lshift(rop + size, c, cl, 3); /* rop has enough storage */ 695 size = mp_sub(rop, rop, rop + size, size, xsize); 696 697 /* w = 2c - w*/ 698 xsize = mp_lshift(rop + size, c, cl, 1); 699 size = mp_sub(rop, rop + size, rop, xsize, size); 700 701 /* b = b/2 */ 702 bl = mp_rshift(b, b, bl, 1); 703 704 /* b = b - c */ 705 bl = mp_sub(b, b, c, bl, cl); 706 707 /* c = w / 2 */ 708 cl = mp_rshift(c, rop, size, 1); 709 710 /* b = b - c */ 711 bl = mp_sub(b, b, c, bl, cl); 712 713 /* b = b/3 */ 714 /* maybe the most expensive calculation */ 715 i = bl - 1; 716 value = b[i]; 717 b[i] = value / 3; 718 for (--i; i >= 0; i--) { 719 carry = value % 3; 720 value = ((BNI)carry << BNSBITS) + b[i]; 721 b[i] = (BNS)(value / 3); 722 } 723 724 /* d = d/2 */ 725 dl = mp_rshift(d, d, dl, 1); 726 727 /* d = d - b - w */ 728 dl = mp_sub(d, d, b, dl, bl); 729 dl = mp_sub(d, d, rop, dl, size); 730 731 /* d = d/4 */ 732 dl = mp_rshift(d, d, dl, 2); 733 734 735 /* STORE RESULT IN ROP */ 736 /* first clear memory used as temporary variable w and 8c */ 737 memset(rop, '\0', sizeof(BNS) * (len1 + len2)); 738 739 i = x * 4; 740 xsize = (len1 + len2) - i; 741 size = mp_add(rop + i, rop + i, e, xsize, el) + i; 742 i = x * 3; 743 xsize = size - i; 744 size = mp_add(rop + i, rop + i, d, xsize, dl) + i; 745 i = x * 2; 746 xsize = size - i; 747 size = mp_add(rop + i, rop + i, c, xsize, cl) + i; 748 i = x; 749 xsize = size - i; 750 size = mp_add(rop + i, rop + i, b, xsize, bl) + i; 751 size = mp_add(rop, rop, a, size, al); 752 753 mp_free(e); 754 mp_free(d); 755 mp_free(c); 756 mp_free(b); 757 mp_free(a); 758 759 if (size > 1 && rop[size - 1] == 0) 760 --size; 761 762 return (size); 763} 764 765long 766mp_mul(BNS *rop, BNS *op1, BNS *op2, BNI len1, BNI len2) 767{ 768 if (len1 < len2) 769 MP_SWAP(op1, op2, len1, len2); 770 771 if (len1 < KARATSUBA || len2 < KARATSUBA) 772 return (mp_base_mul(rop, op1, op2, len1, len2)); 773 else if (len1 < TOOM && len2 < TOOM && len2 > ((len1 + 1) >> 1)) 774 return (mp_karatsuba_mul(rop, op1, op2, len1, len2)); 775 else if (len1 >= TOOM && len2 >= TOOM && (len2 + 2) / 3 == (len1 + 2) / 3) 776 return (mp_toom_mul(rop, op1, op2, len1, len2)); 777 else { 778 long xsize, psize, isize; 779 BNS *ptr; 780 781 /* adjust index pointer and estimated size of result */ 782 isize = 0; 783 xsize = len1 + len2; 784 mp_mul(rop, op1, op2, len2, len2); 785 /* adjust pointers */ 786 len1 -= len2; 787 op1 += len2; 788 789 /* allocate buffer for intermediate multiplications */ 790 if (len1 > len2) 791 ptr = mp_calloc(1, sizeof(BNS) * (len2 + len2)); 792 else 793 ptr = mp_calloc(1, sizeof(BNS) * (len1 + len2)); 794 795 /* loop multiplying len2 size operands at a time */ 796 while (len1 >= len2) { 797 isize += len2; 798 psize = mp_mul(ptr, op1, op2, len2, len2); 799 mp_add(rop + isize, rop + isize, ptr, xsize - isize, psize); 800 len1 -= len2; 801 op1 += len2; 802 803 /* multiplication routines require zeroed memory */ 804 memset(ptr, '\0', sizeof(BNS) * (MIN(len1, len2) + len2)); 805 } 806 807 /* len1 was not a multiple of len2 */ 808 if (len1) { 809 isize += len2; 810 psize = mp_mul(ptr, op2, op1, len2, len1); 811 mp_add(rop + isize, rop + isize, ptr, xsize, psize); 812 } 813 814 /* adjust result size */ 815 if (rop[xsize - 1] == 0) 816 --xsize; 817 818 mp_free(ptr); 819 820 return (xsize); 821 } 822} 823