sshbn.c

   1 /*
   2  * Bignum routines for RSA and DH and stuff.
   3  */
   4
   5 #include <stdio.h>
   6 #include <assert.h>
   7 #include <stdlib.h>
   8 #include <string.h>
   9 #include <limits.h>
  10 #include <ctype.h>
  11
  12 #include "misc.h"
  13
  14 #include "sshbn.h"
  15
  16 #define BIGNUM_INTERNAL
  17 typedef BignumInt *Bignum;
  18
  19 #include "ssh.h"
  20
  21 BignumInt bnZero[1] = { 0 };
  22 BignumInt bnOne[2] = { 1, 1 };
  23 BignumInt bnTen[2] = { 1, 10 };
  24
  25 /*
  26  * The Bignum format is an array of `BignumInt'. The first
  27  * element of the array counts the remaining elements. The
  28  * remaining elements express the actual number, base 2^BIGNUM_INT_BITS, _least_
  29  * significant digit first. (So it's trivial to extract the bit
  30  * with value 2^n for any n.)
  31  *
  32  * All Bignums in this module are positive. Negative numbers must
  33  * be dealt with outside it.
  34  *
  35  * INVARIANT: the most significant word of any Bignum must be
  36  * nonzero.
  37  */
  38
  39 Bignum Zero = bnZero, One = bnOne, Ten = bnTen;
  40
  41 static Bignum newbn(int length)
  42 {
  43     Bignum b;
  44
  45     assert(length >= 0 && length < INT_MAX / BIGNUM_INT_BITS);
  46
  47     b = snewn(length + 1, BignumInt);
  48     memset(b, 0, (length + 1) * sizeof(*b));
  49     b[0] = length;
  50     return b;
  51 }
  52
  53 void bn_restore_invariant(Bignum b)
  54 {
  55     while (b[0] > 1 && b[b[0]] == 0)
  56         b[0]--;
  57 }
  58
  59 Bignum copybn(Bignum orig)
  60 {
  61     Bignum b = snewn(orig[0] + 1, BignumInt);
  62     if (!b)
  63         abort();                       /* FIXME */
  64     memcpy(b, orig, (orig[0] + 1) * sizeof(*b));
  65     return b;
  66 }
  67
  68 void freebn(Bignum b)
  69 {
  70     /*
  71      * Burn the evidence, just in case.
  72      */
  73     smemclr(b, sizeof(b[0]) * (b[0] + 1));
  74     sfree(b);
  75 }
  76
  77 Bignum bn_power_2(int n)
  78 {
  79     Bignum ret;
  80
  81     assert(n >= 0);
  82
  83     ret = newbn(n / BIGNUM_INT_BITS + 1);
  84     bignum_set_bit(ret, n, 1);
  85     return ret;
  86 }
  87
  88 /*
  89  * Internal addition. Sets c = a - b, where 'a', 'b' and 'c' are all
  90  * big-endian arrays of 'len' BignumInts. Returns the carry off the
  91  * top.
  92  */
  93 static BignumCarry internal_add(const BignumInt *a, const BignumInt *b,
  94                                 BignumInt *c, int len)
  95 {
  96     int i;
  97     BignumCarry carry = 0;
  98
  99     for (i = len-1; i >= 0; i--)
 100         BignumADC(c[i], carry, a[i], b[i], carry);
 101
 102     return (BignumInt)carry;
 103 }
 104
 105 /*
 106  * Internal subtraction. Sets c = a - b, where 'a', 'b' and 'c' are
 107  * all big-endian arrays of 'len' BignumInts. Any borrow from the top
 108  * is ignored.
 109  */
 110 static void internal_sub(const BignumInt *a, const BignumInt *b,
 111                          BignumInt *c, int len)
 112 {
 113     int i;
 114     BignumCarry carry = 1;
 115
 116     for (i = len-1; i >= 0; i--)
 117         BignumADC(c[i], carry, a[i], ~b[i], carry);
 118 }
 119
 120 /*
 121  * Compute c = a * b.
 122  * Input is in the first len words of a and b.
 123  * Result is returned in the first 2*len words of c.
 124  *
 125  * 'scratch' must point to an array of BignumInt of size at least
 126  * mul_compute_scratch(len). (This covers the needs of internal_mul
 127  * and all its recursive calls to itself.)
 128  */
 129 #define KARATSUBA_THRESHOLD 50
 130 static int mul_compute_scratch(int len)
 131 {
 132     int ret = 0;
 133     while (len > KARATSUBA_THRESHOLD) {
 134         int toplen = len/2, botlen = len - toplen; /* botlen is the bigger */
 135         int midlen = botlen + 1;
 136         ret += 4*midlen;
 137         len = midlen;
 138     }
 139     return ret;
 140 }
 141 static void internal_mul(const BignumInt *a, const BignumInt *b,
 142                          BignumInt *c, int len, BignumInt *scratch)
 143 {
 144     if (len > KARATSUBA_THRESHOLD) {
 145         int i;
 146
 147         /*
 148          * Karatsuba divide-and-conquer algorithm. Cut each input in
 149          * half, so that it's expressed as two big 'digits' in a giant
 150          * base D:
 151          *
 152          *   a = a_1 D + a_0
 153          *   b = b_1 D + b_0
 154          *
 155          * Then the product is of course
 156          *
 157          *  ab = a_1 b_1 D^2 + (a_1 b_0 + a_0 b_1) D + a_0 b_0
 158          *
 159          * and we compute the three coefficients by recursively
 160          * calling ourself to do half-length multiplications.
 161          *
 162          * The clever bit that makes this worth doing is that we only
 163          * need _one_ half-length multiplication for the central
 164          * coefficient rather than the two that it obviouly looks
 165          * like, because we can use a single multiplication to compute
 166          *
 167          *   (a_1 + a_0) (b_1 + b_0) = a_1 b_1 + a_1 b_0 + a_0 b_1 + a_0 b_0
 168          *
 169          * and then we subtract the other two coefficients (a_1 b_1
 170          * and a_0 b_0) which we were computing anyway.
 171          *
 172          * Hence we get to multiply two numbers of length N in about
 173          * three times as much work as it takes to multiply numbers of
 174          * length N/2, which is obviously better than the four times
 175          * as much work it would take if we just did a long
 176          * conventional multiply.
 177          */
 178
 179         int toplen = len/2, botlen = len - toplen; /* botlen is the bigger */
 180         int midlen = botlen + 1;
 181         BignumCarry carry;
 182 #ifdef KARA_DEBUG
 183         int i;
 184 #endif
 185
 186         /*
 187          * The coefficients a_1 b_1 and a_0 b_0 just avoid overlapping
 188          * in the output array, so we can compute them immediately in
 189          * place.
 190          */
 191
 192 #ifdef KARA_DEBUG
 193         printf("a1,a0 = 0x");
 194         for (i = 0; i < len; i++) {
 195             if (i == toplen) printf(", 0x");
 196             printf("%0*x", BIGNUM_INT_BITS/4, a[i]);
 197         }
 198         printf("\n");
 199         printf("b1,b0 = 0x");
 200         for (i = 0; i < len; i++) {
 201             if (i == toplen) printf(", 0x");
 202             printf("%0*x", BIGNUM_INT_BITS/4, b[i]);
 203         }
 204         printf("\n");
 205 #endif
 206
 207         /* a_1 b_1 */
 208         internal_mul(a, b, c, toplen, scratch);
 209 #ifdef KARA_DEBUG
 210         printf("a1b1 = 0x");
 211         for (i = 0; i < 2*toplen; i++) {
 212             printf("%0*x", BIGNUM_INT_BITS/4, c[i]);
 213         }
 214         printf("\n");
 215 #endif
 216
 217         /* a_0 b_0 */
 218         internal_mul(a + toplen, b + toplen, c + 2*toplen, botlen, scratch);
 219 #ifdef KARA_DEBUG
 220         printf("a0b0 = 0x");
 221         for (i = 0; i < 2*botlen; i++) {
 222             printf("%0*x", BIGNUM_INT_BITS/4, c[2*toplen+i]);
 223         }
 224         printf("\n");
 225 #endif
 226
 227         /* Zero padding. midlen exceeds toplen by at most 2, so just
 228          * zero the first two words of each input and the rest will be
 229          * copied over. */
 230         scratch[0] = scratch[1] = scratch[midlen] = scratch[midlen+1] = 0;
 231
 232         for (i = 0; i < toplen; i++) {
 233             scratch[midlen - toplen + i] = a[i]; /* a_1 */
 234             scratch[2*midlen - toplen + i] = b[i]; /* b_1 */
 235         }
 236
 237         /* compute a_1 + a_0 */
 238         scratch[0] = internal_add(scratch+1, a+toplen, scratch+1, botlen);
 239 #ifdef KARA_DEBUG
 240         printf("a1plusa0 = 0x");
 241         for (i = 0; i < midlen; i++) {
 242             printf("%0*x", BIGNUM_INT_BITS/4, scratch[i]);
 243         }
 244         printf("\n");
 245 #endif
 246         /* compute b_1 + b_0 */
 247         scratch[midlen] = internal_add(scratch+midlen+1, b+toplen,
 248                                        scratch+midlen+1, botlen);
 249 #ifdef KARA_DEBUG
 250         printf("b1plusb0 = 0x");
 251         for (i = 0; i < midlen; i++) {
 252             printf("%0*x", BIGNUM_INT_BITS/4, scratch[midlen+i]);
 253         }
 254         printf("\n");
 255 #endif
 256
 257         /*
 258          * Now we can do the third multiplication.
 259          */
 260         internal_mul(scratch, scratch + midlen, scratch + 2*midlen, midlen,
 261                      scratch + 4*midlen);
 262 #ifdef KARA_DEBUG
 263         printf("a1plusa0timesb1plusb0 = 0x");
 264         for (i = 0; i < 2*midlen; i++) {
 265             printf("%0*x", BIGNUM_INT_BITS/4, scratch[2*midlen+i]);
 266         }
 267         printf("\n");
 268 #endif
 269
 270         /*
 271          * Now we can reuse the first half of 'scratch' to compute the
 272          * sum of the outer two coefficients, to subtract from that
 273          * product to obtain the middle one.
 274          */
 275         scratch[0] = scratch[1] = scratch[2] = scratch[3] = 0;
 276         for (i = 0; i < 2*toplen; i++)
 277             scratch[2*midlen - 2*toplen + i] = c[i];
 278         scratch[1] = internal_add(scratch+2, c + 2*toplen,
 279                                   scratch+2, 2*botlen);
 280 #ifdef KARA_DEBUG
 281         printf("a1b1plusa0b0 = 0x");
 282         for (i = 0; i < 2*midlen; i++) {
 283             printf("%0*x", BIGNUM_INT_BITS/4, scratch[i]);
 284         }
 285         printf("\n");
 286 #endif
 287
 288         internal_sub(scratch + 2*midlen, scratch,
 289                      scratch + 2*midlen, 2*midlen);
 290 #ifdef KARA_DEBUG
 291         printf("a1b0plusa0b1 = 0x");
 292         for (i = 0; i < 2*midlen; i++) {
 293             printf("%0*x", BIGNUM_INT_BITS/4, scratch[2*midlen+i]);
 294         }
 295         printf("\n");
 296 #endif
 297
 298         /*
 299          * And now all we need to do is to add that middle coefficient
 300          * back into the output. We may have to propagate a carry
 301          * further up the output, but we can be sure it won't
 302          * propagate right the way off the top.
 303          */
 304         carry = internal_add(c + 2*len - botlen - 2*midlen,
 305                              scratch + 2*midlen,
 306                              c + 2*len - botlen - 2*midlen, 2*midlen);
 307         i = 2*len - botlen - 2*midlen - 1;
 308         while (carry) {
 309             assert(i >= 0);
 310             BignumADC(c[i], carry, c[i], 0, carry);
 311             i--;
 312         }
 313 #ifdef KARA_DEBUG
 314         printf("ab = 0x");
 315         for (i = 0; i < 2*len; i++) {
 316             printf("%0*x", BIGNUM_INT_BITS/4, c[i]);
 317         }
 318         printf("\n");
 319 #endif
 320
 321     } else {
 322         int i;
 323         BignumInt carry;
 324         const BignumInt *ap, *bp;
 325         BignumInt *cp, *cps;
 326
 327         /*
 328          * Multiply in the ordinary O(N^2) way.
 329          */
 330
 331         for (i = 0; i < 2 * len; i++)
 332             c[i] = 0;
 333
 334         for (cps = c + 2*len, ap = a + len; ap-- > a; cps--) {
 335             carry = 0;
 336             for (cp = cps, bp = b + len; cp--, bp-- > b ;)
 337                 BignumMULADD2(carry, *cp, *ap, *bp, *cp, carry);
 338             *cp = carry;
 339         }
 340     }
 341 }
 342
 343 /*
 344  * Variant form of internal_mul used for the initial step of
 345  * Montgomery reduction. Only bothers outputting 'len' words
 346  * (everything above that is thrown away).
 347  */
 348 static void internal_mul_low(const BignumInt *a, const BignumInt *b,
 349                              BignumInt *c, int len, BignumInt *scratch)
 350 {
 351     if (len > KARATSUBA_THRESHOLD) {
 352         int i;
 353
 354         /*
 355          * Karatsuba-aware version of internal_mul_low. As before, we
 356          * express each input value as a shifted combination of two
 357          * halves:
 358          *
 359          *   a = a_1 D + a_0
 360          *   b = b_1 D + b_0
 361          *
 362          * Then the full product is, as before,
 363          *
 364          *  ab = a_1 b_1 D^2 + (a_1 b_0 + a_0 b_1) D + a_0 b_0
 365          *
 366          * Provided we choose D on the large side (so that a_0 and b_0
 367          * are _at least_ as long as a_1 and b_1), we don't need the
 368          * topmost term at all, and we only need half of the middle
 369          * term. So there's no point in doing the proper Karatsuba
 370          * optimisation which computes the middle term using the top
 371          * one, because we'd take as long computing the top one as
 372          * just computing the middle one directly.
 373          *
 374          * So instead, we do a much more obvious thing: we call the
 375          * fully optimised internal_mul to compute a_0 b_0, and we
 376          * recursively call ourself to compute the _bottom halves_ of
 377          * a_1 b_0 and a_0 b_1, each of which we add into the result
 378          * in the obvious way.
 379          *
 380          * In other words, there's no actual Karatsuba _optimisation_
 381          * in this function; the only benefit in doing it this way is
 382          * that we call internal_mul proper for a large part of the
 383          * work, and _that_ can optimise its operation.
 384          */
 385
 386         int toplen = len/2, botlen = len - toplen; /* botlen is the bigger */
 387
 388         /*
 389          * Scratch space for the various bits and pieces we're going
 390          * to be adding together: we need botlen*2 words for a_0 b_0
 391          * (though we may end up throwing away its topmost word), and
 392          * toplen words for each of a_1 b_0 and a_0 b_1. That adds up
 393          * to exactly 2*len.
 394          */
 395
 396         /* a_0 b_0 */
 397         internal_mul(a + toplen, b + toplen, scratch + 2*toplen, botlen,
 398                      scratch + 2*len);
 399
 400         /* a_1 b_0 */
 401         internal_mul_low(a, b + len - toplen, scratch + toplen, toplen,
 402                          scratch + 2*len);
 403
 404         /* a_0 b_1 */
 405         internal_mul_low(a + len - toplen, b, scratch, toplen,
 406                          scratch + 2*len);
 407
 408         /* Copy the bottom half of the big coefficient into place */
 409         for (i = 0; i < botlen; i++)
 410             c[toplen + i] = scratch[2*toplen + botlen + i];
 411
 412         /* Add the two small coefficients, throwing away the returned carry */
 413         internal_add(scratch, scratch + toplen, scratch, toplen);
 414
 415         /* And add that to the large coefficient, leaving the result in c. */
 416         internal_add(scratch, scratch + 2*toplen + botlen - toplen,
 417                      c, toplen);
 418
 419     } else {
 420         int i;
 421         BignumInt carry;
 422         const BignumInt *ap, *bp;
 423         BignumInt *cp, *cps;
 424
 425         /*
 426          * Multiply in the ordinary O(N^2) way.
 427          */
 428
 429         for (i = 0; i < len; i++)
 430             c[i] = 0;
 431
 432         for (cps = c + len, ap = a + len; ap-- > a; cps--) {
 433             carry = 0;
 434             for (cp = cps, bp = b + len; bp--, cp-- > c ;)
 435                 BignumMULADD2(carry, *cp, *ap, *bp, *cp, carry);
 436         }
 437     }
 438 }
 439
 440 /*
 441  * Montgomery reduction. Expects x to be a big-endian array of 2*len
 442  * BignumInts whose value satisfies 0 <= x < rn (where r = 2^(len *
 443  * BIGNUM_INT_BITS) is the Montgomery base). Returns in the same array
 444  * a value x' which is congruent to xr^{-1} mod n, and satisfies 0 <=
 445  * x' < n.
 446  *
 447  * 'n' and 'mninv' should be big-endian arrays of 'len' BignumInts
 448  * each, containing respectively n and the multiplicative inverse of
 449  * -n mod r.
 450  *
 451  * 'tmp' is an array of BignumInt used as scratch space, of length at
 452  * least 3*len + mul_compute_scratch(len).
 453  */
 454 static void monty_reduce(BignumInt *x, const BignumInt *n,
 455                          const BignumInt *mninv, BignumInt *tmp, int len)
 456 {
 457     int i;
 458     BignumInt carry;
 459
 460     /*
 461      * Multiply x by (-n)^{-1} mod r. This gives us a value m such
 462      * that mn is congruent to -x mod r. Hence, mn+x is an exact
 463      * multiple of r, and is also (obviously) congruent to x mod n.
 464      */
 465     internal_mul_low(x + len, mninv, tmp, len, tmp + 3*len);
 466
 467     /*
 468      * Compute t = (mn+x)/r in ordinary, non-modular, integer
 469      * arithmetic. By construction this is exact, and is congruent mod
 470      * n to x * r^{-1}, i.e. the answer we want.
 471      *
 472      * The following multiply leaves that answer in the _most_
 473      * significant half of the 'x' array, so then we must shift it
 474      * down.
 475      */
 476     internal_mul(tmp, n, tmp+len, len, tmp + 3*len);
 477     carry = internal_add(x, tmp+len, x, 2*len);
 478     for (i = 0; i < len; i++)
 479         x[len + i] = x[i], x[i] = 0;
 480
 481     /*
 482      * Reduce t mod n. This doesn't require a full-on division by n,
 483      * but merely a test and single optional subtraction, since we can
 484      * show that 0 <= t < 2n.
 485      *
 486      * Proof:
 487      *  + we computed m mod r, so 0 <= m < r.
 488      *  + so 0 <= mn < rn, obviously
 489      *  + hence we only need 0 <= x < rn to guarantee that 0 <= mn+x < 2rn
 490      *  + yielding 0 <= (mn+x)/r < 2n as required.
 491      */
 492     if (!carry) {
 493         for (i = 0; i < len; i++)
 494             if (x[len + i] != n[i])
 495                 break;
 496     }
 497     if (carry || i >= len || x[len + i] > n[i])
 498         internal_sub(x+len, n, x+len, len);
 499 }
 500
 501 static void internal_add_shifted(BignumInt *number,
 502                                  BignumInt n, int shift)
 503 {
 504     int word = 1 + (shift / BIGNUM_INT_BITS);
 505     int bshift = shift % BIGNUM_INT_BITS;
 506     BignumInt addendh, addendl;
 507     BignumCarry carry;
 508
 509     addendl = n << bshift;
 510     addendh = (bshift == 0 ? 0 : n >> (BIGNUM_INT_BITS - bshift));
 511
 512     assert(word <= number[0]);
 513     BignumADC(number[word], carry, number[word], addendl, 0);
 514     word++;
 515     if (!addendh && !carry)
 516         return;
 517     assert(word <= number[0]);
 518     BignumADC(number[word], carry, number[word], addendh, carry);
 519     word++;
 520     while (carry) {
 521         assert(word <= number[0]);
 522         BignumADC(number[word], carry, number[word], 0, carry);
 523         word++;
 524     }
 525 }
 526
 527 static int bn_clz(BignumInt x)
 528 {
 529     /*
 530      * Count the leading zero bits in x. Equivalently, how far left
 531      * would we need to shift x to make its top bit set?
 532      *
 533      * Precondition: x != 0.
 534      */
 535
 536     /* FIXME: would be nice to put in some compiler intrinsics under
 537      * ifdef here */
 538     int i, ret = 0;
 539     for (i = BIGNUM_INT_BITS / 2; i != 0; i >>= 1) {
 540         if ((x >> (BIGNUM_INT_BITS-i)) == 0) {
 541             x <<= i;
 542             ret += i;
 543         }
 544     }
 545     return ret;
 546 }
 547
 548 static BignumInt reciprocal_word(BignumInt d)
 549 {
 550     BignumInt dshort, recip, prodh, prodl;
 551     int corrections;
 552
 553     /*
 554      * Input: a BignumInt value d, with its top bit set.
 555      */
 556     assert(d >> (BIGNUM_INT_BITS-1) == 1);
 557
 558     /*
 559      * Output: a value, shifted to fill a BignumInt, which is strictly
 560      * less than 1/(d+1), i.e. is an *under*-estimate (but by as
 561      * little as possible within the constraints) of the reciprocal of
 562      * any number whose first BIGNUM_INT_BITS bits match d.
 563      *
 564      * Ideally we'd like to _totally_ fill BignumInt, i.e. always
 565      * return a value with the top bit set. Unfortunately we can't
 566      * quite guarantee that for all inputs and also return a fixed
 567      * exponent. So instead we take our reciprocal to be
 568      * 2^(BIGNUM_INT_BITS*2-1) / d, so that it has the top bit clear
 569      * only in the exceptional case where d takes exactly the maximum
 570      * value BIGNUM_INT_MASK; in that case, the top bit is clear and
 571      * the next bit down is set.
 572      */
 573
 574     /*
 575      * Start by computing a half-length version of the answer, by
 576      * straightforward division within a BignumInt.
 577      */
 578     dshort = (d >> (BIGNUM_INT_BITS/2)) + 1;
 579     recip = (BIGNUM_TOP_BIT + dshort - 1) / dshort;
 580     recip <<= BIGNUM_INT_BITS - BIGNUM_INT_BITS/2;
 581
 582     /*
 583      * Newton-Raphson iteration to improve that starting reciprocal
 584      * estimate: take f(x) = d - 1/x, and then the N-R formula gives
 585      * x_new = x - f(x)/f'(x) = x - (d-1/x)/(1/x^2) = x(2-d*x). Or,
 586      * taking our fixed-point representation into account, take f(x)
 587      * to be d - K/x (where K = 2^(BIGNUM_INT_BITS*2-1) as discussed
 588      * above) and then we get (2K - d*x) * x/K.
 589      *
 590      * Newton-Raphson doubles the number of correct bits at every
 591      * iteration, and the initial division above already gave us half
 592      * the output word, so it's only worth doing one iteration.
 593      */
 594     BignumMULADD(prodh, prodl, recip, d, recip);
 595     prodl = ~prodl;
 596     prodh = ~prodh;
 597     {
 598         BignumCarry c;
 599         BignumADC(prodl, c, prodl, 1, 0);
 600         prodh += c;
 601     }
 602     BignumMUL(prodh, prodl, prodh, recip);
 603     recip = (prodh << 1) | (prodl >> (BIGNUM_INT_BITS-1));
 604
 605     /*
 606      * Now make sure we have the best possible reciprocal estimate,
 607      * before we return it. We might have been off by a handful either
 608      * way - not enough to bother with any better-thought-out kind of
 609      * correction loop.
 610      */
 611     BignumMULADD(prodh, prodl, recip, d, recip);
 612     corrections = 0;
 613     if (prodh >= BIGNUM_TOP_BIT) {
 614         do {
 615             BignumCarry c = 1;
 616             BignumADC(prodl, c, prodl, ~d, c); prodh += BIGNUM_INT_MASK + c;
 617             recip--;
 618             corrections++;
 619         } while (prodh >= ((BignumInt)1 << (BIGNUM_INT_BITS-1)));
 620     } else {
 621         while (1) {
 622             BignumInt newprodh, newprodl;
 623             BignumCarry c = 0;
 624             BignumADC(newprodl, c, prodl, d, c); newprodh = prodh + c;
 625             if (newprodh >= BIGNUM_TOP_BIT)
 626                 break;
 627             prodh = newprodh;
 628             prodl = newprodl;
 629             recip++;
 630             corrections++;
 631         }
 632     }
 633
 634     return recip;
 635 }
 636
 637 /*
 638  * Compute a = a % m.
 639  * Input in first alen words of a and first mlen words of m.
 640  * Output in first alen words of a
 641  * (of which first alen-mlen words will be zero).
 642  * Quotient is accumulated in the `quotient' array, which is a Bignum
 643  * rather than the internal bigendian format.
 644  *
 645  * 'recip' must be the result of calling reciprocal_word() on the top
 646  * BIGNUM_INT_BITS of the modulus (denoted m0 in comments below), with
 647  * the topmost set bit normalised to the MSB of the input to
 648  * reciprocal_word. 'rshift' is how far left the top nonzero word of
 649  * the modulus had to be shifted to set that top bit.
 650  */
 651 static void internal_mod(BignumInt *a, int alen,
 652                          BignumInt *m, int mlen,
 653                          BignumInt *quot, BignumInt recip, int rshift)
 654 {
 655     int i, k;
 656
 657 #ifdef DIVISION_DEBUG
 658     {
 659         int d;
 660         printf("start division, m=0x");
 661         for (d = 0; d < mlen; d++)
 662             printf("%0*llx", BIGNUM_INT_BITS/4, (unsigned long long)m[d]);
 663         printf(", recip=%#0*llx, rshift=%d\n",
 664                BIGNUM_INT_BITS/4, (unsigned long long)recip, rshift);
 665     }
 666 #endif
 667
 668     /*
 669      * Repeatedly use that reciprocal estimate to get a decent number
 670      * of quotient bits, and subtract off the resulting multiple of m.
 671      *
 672      * Normally we expect to terminate this loop by means of finding
 673      * out q=0 part way through, but one way in which we might not get
 674      * that far in the first place is if the input a is actually zero,
 675      * in which case we'll discard zero words from the front of a
 676      * until we reach the termination condition in the for statement
 677      * here.
 678      */
 679     for (i = 0; i <= alen - mlen ;) {
 680         BignumInt product;
 681         BignumInt aword, q;
 682         int shift, full_bitoffset, bitoffset, wordoffset;
 683
 684 #ifdef DIVISION_DEBUG
 685         {
 686             int d;
 687             printf("main loop, a=0x");
 688             for (d = 0; d < alen; d++)
 689                 printf("%0*llx", BIGNUM_INT_BITS/4, (unsigned long long)a[d]);
 690             printf("\n");
 691         }
 692 #endif
 693
 694         if (a[i] == 0) {
 695 #ifdef DIVISION_DEBUG
 696             printf("zero word at i=%d\n", i);
 697 #endif
 698             i++;
 699             continue;
 700         }
 701
 702         aword = a[i];
 703         shift = bn_clz(aword);
 704         aword <<= shift;
 705         if (shift > 0 && i+1 < alen)
 706             aword |= a[i+1] >> (BIGNUM_INT_BITS - shift);
 707
 708         {
 709             BignumInt unused;
 710             BignumMUL(q, unused, recip, aword);
 711             (void)unused;
 712         }
 713
 714 #ifdef DIVISION_DEBUG
 715         printf("i=%d, aword=%#0*llx, shift=%d, q=%#0*llx\n",
 716                i, BIGNUM_INT_BITS/4, (unsigned long long)aword,
 717                shift, BIGNUM_INT_BITS/4, (unsigned long long)q);
 718 #endif
 719
 720         /*
 721          * Work out the right bit and word offsets to use when
 722          * subtracting q*m from a.
 723          *
 724          * aword was taken from a[i], which means its LSB was at bit
 725          * position (alen-1-i) * BIGNUM_INT_BITS. But then we shifted
 726          * it left by 'shift', so now the low bit of aword corresponds
 727          * to bit position (alen-1-i) * BIGNUM_INT_BITS - shift, i.e.
 728          * aword is approximately equal to a / 2^(that).
 729          *
 730          * m0 comes from the top word of mod, so its LSB is at bit
 731          * position (mlen-1) * BIGNUM_INT_BITS - rshift, i.e. it can
 732          * be considered to be m / 2^(that power). 'recip' is the
 733          * reciprocal of m0, times 2^(BIGNUM_INT_BITS*2-1), i.e. it's
 734          * about 2^((mlen+1) * BIGNUM_INT_BITS - rshift - 1) / m.
 735          *
 736          * Hence, recip * aword is approximately equal to the product
 737          * of those, which simplifies to
 738          *
 739          * a/m * 2^((mlen+2+i-alen)*BIGNUM_INT_BITS + shift - rshift - 1)
 740          *
 741          * But we've also shifted recip*aword down by BIGNUM_INT_BITS
 742          * to form q, so we have
 743          *
 744          * q ~= a/m * 2^((mlen+1+i-alen)*BIGNUM_INT_BITS + shift - rshift - 1)
 745          *
 746          * and hence, when we now compute q*m, it will be about
 747          * a*2^(all that lot), i.e. the negation of that expression is
 748          * how far left we have to shift the product q*m to make it
 749          * approximately equal to a.
 750          */
 751         full_bitoffset = -((mlen+1+i-alen)*BIGNUM_INT_BITS + shift-rshift-1);
 752 #ifdef DIVISION_DEBUG
 753         printf("full_bitoffset=%d\n", full_bitoffset);
 754 #endif
 755
 756         if (full_bitoffset < 0) {
 757             /*
 758              * If we find ourselves needing to shift q*m _right_, that
 759              * means we've reached the bottom of the quotient. Clip q
 760              * so that its right shift becomes zero, and if that means
 761              * q becomes _actually_ zero, this loop is done.
 762              */
 763             if (full_bitoffset <= -BIGNUM_INT_BITS)
 764                 break;
 765             q >>= -full_bitoffset;
 766             full_bitoffset = 0;
 767             if (!q)
 768                 break;
 769 #ifdef DIVISION_DEBUG
 770             printf("now full_bitoffset=%d, q=%#0*llx\n",
 771                    full_bitoffset, BIGNUM_INT_BITS/4, (unsigned long long)q);
 772 #endif
 773         }
 774
 775         wordoffset = full_bitoffset / BIGNUM_INT_BITS;
 776         bitoffset = full_bitoffset % BIGNUM_INT_BITS;
 777 #ifdef DIVISION_DEBUG
 778         printf("wordoffset=%d, bitoffset=%d\n", wordoffset, bitoffset);
 779 #endif
 780
 781         /* wordoffset as computed above is the offset between the LSWs
 782          * of m and a. But in fact m and a are stored MSW-first, so we
 783          * need to adjust it to be the offset between the actual array
 784          * indices, and flip the sign too. */
 785         wordoffset = alen - mlen - wordoffset;
 786
 787         if (bitoffset == 0) {
 788             BignumCarry c = 1;
 789             BignumInt prev_hi_word = 0;
 790             for (k = mlen - 1; wordoffset+k >= i; k--) {
 791                 BignumInt mword = k<0 ? 0 : m[k];
 792                 BignumMULADD(prev_hi_word, product, q, mword, prev_hi_word);
 793 #ifdef DIVISION_DEBUG
 794                 printf("  aligned sub: product word for m[%d] = %#0*llx\n",
 795                        k, BIGNUM_INT_BITS/4,
 796                        (unsigned long long)product);
 797 #endif
 798 #ifdef DIVISION_DEBUG
 799                 printf("  aligned sub: subtrahend for a[%d] = %#0*llx\n",
 800                        wordoffset+k, BIGNUM_INT_BITS/4,
 801                        (unsigned long long)product);
 802 #endif
 803                 BignumADC(a[wordoffset+k], c, a[wordoffset+k], ~product, c);
 804             }
 805         } else {
 806             BignumInt add_word = 0;
 807             BignumInt c = 1;
 808             BignumInt prev_hi_word = 0;
 809             for (k = mlen - 1; wordoffset+k >= i; k--) {
 810                 BignumInt mword = k<0 ? 0 : m[k];
 811                 BignumMULADD(prev_hi_word, product, q, mword, prev_hi_word);
 812 #ifdef DIVISION_DEBUG
 813                 printf("  unaligned sub: product word for m[%d] = %#0*llx\n",
 814                        k, BIGNUM_INT_BITS/4,
 815                        (unsigned long long)product);
 816 #endif
 817
 818                 add_word |= product << bitoffset;
 819
 820 #ifdef DIVISION_DEBUG
 821                 printf("  unaligned sub: subtrahend for a[%d] = %#0*llx\n",
 822                        wordoffset+k,
 823                        BIGNUM_INT_BITS/4, (unsigned long long)add_word);
 824 #endif
 825                 BignumADC(a[wordoffset+k], c, a[wordoffset+k], ~add_word, c);
 826
 827                 add_word = product >> (BIGNUM_INT_BITS - bitoffset);
 828             }
 829         }
 830
 831         if (quot) {
 832 #ifdef DIVISION_DEBUG
 833             printf("adding quotient word %#0*llx << %d\n",
 834                    BIGNUM_INT_BITS/4, (unsigned long long)q, full_bitoffset);
 835 #endif
 836             internal_add_shifted(quot, q, full_bitoffset);
 837 #ifdef DIVISION_DEBUG
 838             {
 839                 int d;
 840                 printf("now quot=0x");
 841                 for (d = quot[0]; d > 0; d--)
 842                     printf("%0*llx", BIGNUM_INT_BITS/4,
 843                            (unsigned long long)quot[d]);
 844                 printf("\n");
 845             }
 846 #endif
 847         }
 848     }
 849
 850 #ifdef DIVISION_DEBUG
 851     {
 852         int d;
 853         printf("end main loop, a=0x");
 854         for (d = 0; d < alen; d++)
 855             printf("%0*llx", BIGNUM_INT_BITS/4, (unsigned long long)a[d]);
 856         if (quot) {
 857             printf(", quot=0x");
 858             for (d = quot[0]; d > 0; d--)
 859                 printf("%0*llx", BIGNUM_INT_BITS/4,
 860                        (unsigned long long)quot[d]);
 861         }
 862         printf("\n");
 863     }
 864 #endif
 865
 866     /*
 867      * The above loop should terminate with the remaining value in a
 868      * being strictly less than 2*m (if a >= 2*m then we should always
 869      * have managed to get a nonzero q word), but we can't guarantee
 870      * that it will be strictly less than m: consider a case where the
 871      * remainder is 1, and another where the remainder is m-1. By the
 872      * time a contains a value that's _about m_, you clearly can't
 873      * distinguish those cases by looking at only the top word of a -
 874      * you have to go all the way down to the bottom before you find
 875      * out whether it's just less or just more than m.
 876      *
 877      * Hence, we now do a final fixup in which we subtract one last
 878      * copy of m, or don't, accordingly. We should never have to
 879      * subtract more than one copy of m here.
 880      */
 881     for (i = 0; i < alen; i++) {
 882         /* Compare a with m, word by word, from the MSW down. As soon
 883          * as we encounter a difference, we know whether we need the
 884          * fixup. */
 885         int mindex = mlen-alen+i;
 886         BignumInt mword = mindex < 0 ? 0 : m[mindex];
 887         if (a[i] < mword) {
 888 #ifdef DIVISION_DEBUG
 889             printf("final fixup not needed, a < m\n");
 890 #endif
 891             return;
 892         } else if (a[i] > mword) {
 893 #ifdef DIVISION_DEBUG
 894             printf("final fixup is needed, a > m\n");
 895 #endif
 896             break;
 897         }
 898         /* If neither of those cases happened, the words are the same,
 899          * so keep going and look at the next one. */
 900     }
 901 #ifdef DIVISION_DEBUG
 902     if (i == mlen) /* if we printed neither of the above diagnostics */
 903         printf("final fixup is needed, a == m\n");
 904 #endif
 905
 906     /*
 907      * If we got here without returning, then a >= m, so we must
 908      * subtract m, and increment the quotient.
 909      */
 910     {
 911         BignumCarry c = 1;
 912         for (i = alen - 1; i >= 0; i--) {
 913             int mindex = mlen-alen+i;
 914             BignumInt mword = mindex < 0 ? 0 : m[mindex];
 915             BignumADC(a[i], c, a[i], ~mword, c);
 916         }
 917     }
 918     if (quot)
 919         internal_add_shifted(quot, 1, 0);
 920
 921 #ifdef DIVISION_DEBUG
 922     {
 923         int d;
 924         printf("after final fixup, a=0x");
 925         for (d = 0; d < alen; d++)
 926             printf("%0*llx", BIGNUM_INT_BITS/4, (unsigned long long)a[d]);
 927         if (quot) {
 928             printf(", quot=0x");
 929             for (d = quot[0]; d > 0; d--)
 930                 printf("%0*llx", BIGNUM_INT_BITS/4,
 931                        (unsigned long long)quot[d]);
 932         }
 933         printf("\n");
 934     }
 935 #endif
 936 }
 937
 938 /*
 939  * Compute (base ^ exp) % mod, the pedestrian way.
 940  */
 941 Bignum modpow_simple(Bignum base_in, Bignum exp, Bignum mod)
 942 {
 943     BignumInt *a, *b, *n, *m, *scratch;
 944     BignumInt recip;
 945     int rshift;
 946     int mlen, scratchlen, i, j;
 947     Bignum base, result;
 948
 949     /*
 950      * The most significant word of mod needs to be non-zero. It
 951      * should already be, but let's make sure.
 952      */
 953     assert(mod[mod[0]] != 0);
 954
 955     /*
 956      * Make sure the base is smaller than the modulus, by reducing
 957      * it modulo the modulus if not.
 958      */
 959     base = bigmod(base_in, mod);
 960
 961     /* Allocate m of size mlen, copy mod to m */
 962     /* We use big endian internally */
 963     mlen = mod[0];
 964     m = snewn(mlen, BignumInt);
 965     for (j = 0; j < mlen; j++)
 966         m[j] = mod[mod[0] - j];
 967
 968     /* Allocate n of size mlen, copy base to n */
 969     n = snewn(mlen, BignumInt);
 970     i = mlen - base[0];
 971     for (j = 0; j < i; j++)
 972         n[j] = 0;
 973     for (j = 0; j < (int)base[0]; j++)
 974         n[i + j] = base[base[0] - j];
 975
 976     /* Allocate a and b of size 2*mlen. Set a = 1 */
 977     a = snewn(2 * mlen, BignumInt);
 978     b = snewn(2 * mlen, BignumInt);
 979     for (i = 0; i < 2 * mlen; i++)
 980         a[i] = 0;
 981     a[2 * mlen - 1] = 1;
 982
 983     /* Scratch space for multiplies */
 984     scratchlen = mul_compute_scratch(mlen);
 985     scratch = snewn(scratchlen, BignumInt);
 986
 987     /* Skip leading zero bits of exp. */
 988     i = 0;
 989     j = BIGNUM_INT_BITS-1;
 990     while (i < (int)exp[0] && (exp[exp[0] - i] & ((BignumInt)1 << j)) == 0) {
 991         j--;
 992         if (j < 0) {
 993             i++;
 994             j = BIGNUM_INT_BITS-1;
 995         }
 996     }
 997
 998     /* Compute reciprocal of the top full word of the modulus */
 999     {
1000         BignumInt m0 = m[0];
1001         rshift = bn_clz(m0);
1002         if (rshift) {
1003             m0 <<= rshift;
1004             if (mlen > 1)
1005                 m0 |= m[1] >> (BIGNUM_INT_BITS - rshift);
1006         }
1007         recip = reciprocal_word(m0);
1008     }
1009
1010     /* Main computation */
1011     while (i < (int)exp[0]) {
1012         while (j >= 0) {
1013             internal_mul(a + mlen, a + mlen, b, mlen, scratch);
1014             internal_mod(b, mlen * 2, m, mlen, NULL, recip, rshift);
1015             if ((exp[exp[0] - i] & ((BignumInt)1 << j)) != 0) {
1016                 internal_mul(b + mlen, n, a, mlen, scratch);
1017                 internal_mod(a, mlen * 2, m, mlen, NULL, recip, rshift);
1018             } else {
1019                 BignumInt *t;
1020                 t = a;
1021                 a = b;
1022                 b = t;
1023             }
1024             j--;
1025         }
1026         i++;
1027         j = BIGNUM_INT_BITS-1;
1028     }
1029
1030     /* Copy result to buffer */
1031     result = newbn(mod[0]);
1032     for (i = 0; i < mlen; i++)
1033         result[result[0] - i] = a[i + mlen];
1034     while (result[0] > 1 && result[result[0]] == 0)
1035         result[0]--;
1036
1037     /* Free temporary arrays */
1038     smemclr(a, 2 * mlen * sizeof(*a));
1039     sfree(a);
1040     smemclr(scratch, scratchlen * sizeof(*scratch));
1041     sfree(scratch);
1042     smemclr(b, 2 * mlen * sizeof(*b));
1043     sfree(b);
1044     smemclr(m, mlen * sizeof(*m));
1045     sfree(m);
1046     smemclr(n, mlen * sizeof(*n));
1047     sfree(n);
1048
1049     freebn(base);
1050
1051     return result;
1052 }
1053
1054 /*
1055  * Compute (base ^ exp) % mod. Uses the Montgomery multiplication
1056  * technique where possible, falling back to modpow_simple otherwise.
1057  */
1058 Bignum modpow(Bignum base_in, Bignum exp, Bignum mod)
1059 {
1060     BignumInt *a, *b, *x, *n, *mninv, *scratch;
1061     int len, scratchlen, i, j;
1062     Bignum base, base2, r, rn, inv, result;
1063
1064     /*
1065      * The most significant word of mod needs to be non-zero. It
1066      * should already be, but let's make sure.
1067      */
1068     assert(mod[mod[0]] != 0);
1069
1070     /*
1071      * mod had better be odd, or we can't do Montgomery multiplication
1072      * using a power of two at all.
1073      */
1074     if (!(mod[1] & 1))
1075         return modpow_simple(base_in, exp, mod);
1076
1077     /*
1078      * Make sure the base is smaller than the modulus, by reducing
1079      * it modulo the modulus if not.
1080      */
1081     base = bigmod(base_in, mod);
1082
1083     /*
1084      * Compute the inverse of n mod r, for monty_reduce. (In fact we
1085      * want the inverse of _minus_ n mod r, but we'll sort that out
1086      * below.)
1087      */
1088     len = mod[0];
1089     r = bn_power_2(BIGNUM_INT_BITS * len);
1090     inv = modinv(mod, r);
1091     assert(inv); /* cannot fail, since mod is odd and r is a power of 2 */
1092
1093     /*
1094      * Multiply the base by r mod n, to get it into Montgomery
1095      * representation.
1096      */
1097     base2 = modmul(base, r, mod);
1098     freebn(base);
1099     base = base2;
1100
1101     rn = bigmod(r, mod);               /* r mod n, i.e. Montgomerified 1 */
1102
1103     freebn(r);                         /* won't need this any more */
1104
1105     /*
1106      * Set up internal arrays of the right lengths, in big-endian
1107      * format, containing the base, the modulus, and the modulus's
1108      * inverse.
1109      */
1110     n = snewn(len, BignumInt);
1111     for (j = 0; j < len; j++)
1112         n[len - 1 - j] = mod[j + 1];
1113
1114     mninv = snewn(len, BignumInt);
1115     for (j = 0; j < len; j++)
1116         mninv[len - 1 - j] = (j < (int)inv[0] ? inv[j + 1] : 0);
1117     freebn(inv);         /* we don't need this copy of it any more */
1118     /* Now negate mninv mod r, so it's the inverse of -n rather than +n. */
1119     x = snewn(len, BignumInt);
1120     for (j = 0; j < len; j++)
1121         x[j] = 0;
1122     internal_sub(x, mninv, mninv, len);
1123
1124     /* x = snewn(len, BignumInt); */ /* already done above */
1125     for (j = 0; j < len; j++)
1126         x[len - 1 - j] = (j < (int)base[0] ? base[j + 1] : 0);
1127     freebn(base);        /* we don't need this copy of it any more */
1128
1129     a = snewn(2*len, BignumInt);
1130     b = snewn(2*len, BignumInt);
1131     for (j = 0; j < len; j++)
1132         a[2*len - 1 - j] = (j < (int)rn[0] ? rn[j + 1] : 0);
1133     freebn(rn);
1134
1135     /* Scratch space for multiplies */
1136     scratchlen = 3*len + mul_compute_scratch(len);
1137     scratch = snewn(scratchlen, BignumInt);
1138
1139     /* Skip leading zero bits of exp. */
1140     i = 0;
1141     j = BIGNUM_INT_BITS-1;
1142     while (i < (int)exp[0] && (exp[exp[0] - i] & ((BignumInt)1 << j)) == 0) {
1143         j--;
1144         if (j < 0) {
1145             i++;
1146             j = BIGNUM_INT_BITS-1;
1147         }
1148     }
1149
1150     /* Main computation */
1151     while (i < (int)exp[0]) {
1152         while (j >= 0) {
1153             internal_mul(a + len, a + len, b, len, scratch);
1154             monty_reduce(b, n, mninv, scratch, len);
1155             if ((exp[exp[0] - i] & ((BignumInt)1 << j)) != 0) {
1156                 internal_mul(b + len, x, a, len,  scratch);
1157                 monty_reduce(a, n, mninv, scratch, len);
1158             } else {
1159                 BignumInt *t;
1160                 t = a;
1161                 a = b;
1162                 b = t;
1163             }
1164             j--;
1165         }
1166         i++;
1167         j = BIGNUM_INT_BITS-1;
1168     }
1169
1170     /*
1171      * Final monty_reduce to get back from the adjusted Montgomery
1172      * representation.
1173      */
1174     monty_reduce(a, n, mninv, scratch, len);
1175
1176     /* Copy result to buffer */
1177     result = newbn(mod[0]);
1178     for (i = 0; i < len; i++)
1179         result[result[0] - i] = a[i + len];
1180     while (result[0] > 1 && result[result[0]] == 0)
1181         result[0]--;
1182
1183     /* Free temporary arrays */
1184     smemclr(scratch, scratchlen * sizeof(*scratch));
1185     sfree(scratch);
1186     smemclr(a, 2 * len * sizeof(*a));
1187     sfree(a);
1188     smemclr(b, 2 * len * sizeof(*b));
1189     sfree(b);
1190     smemclr(mninv, len * sizeof(*mninv));
1191     sfree(mninv);
1192     smemclr(n, len * sizeof(*n));
1193     sfree(n);
1194     smemclr(x, len * sizeof(*x));
1195     sfree(x);
1196
1197     return result;
1198 }
1199
1200 /*
1201  * Compute (p * q) % mod.
1202  * The most significant word of mod MUST be non-zero.
1203  * We assume that the result array is the same size as the mod array.
1204  */
1205 Bignum modmul(Bignum p, Bignum q, Bignum mod)
1206 {
1207     BignumInt *a, *n, *m, *o, *scratch;
1208     BignumInt recip;
1209     int rshift, scratchlen;
1210     int pqlen, mlen, rlen, i, j;
1211     Bignum result;
1212
1213     /*
1214      * The most significant word of mod needs to be non-zero. It
1215      * should already be, but let's make sure.
1216      */
1217     assert(mod[mod[0]] != 0);
1218
1219     /* Allocate m of size mlen, copy mod to m */
1220     /* We use big endian internally */
1221     mlen = mod[0];
1222     m = snewn(mlen, BignumInt);
1223     for (j = 0; j < mlen; j++)
1224         m[j] = mod[mod[0] - j];
1225
1226     pqlen = (p[0] > q[0] ? p[0] : q[0]);
1227
1228     /*
1229      * Make sure that we're allowing enough space. The shifting below
1230      * will underflow the vectors we allocate if pqlen is too small.
1231      */
1232     if (2*pqlen <= mlen)
1233         pqlen = mlen/2 + 1;
1234
1235     /* Allocate n of size pqlen, copy p to n */
1236     n = snewn(pqlen, BignumInt);
1237     i = pqlen - p[0];
1238     for (j = 0; j < i; j++)
1239         n[j] = 0;
1240     for (j = 0; j < (int)p[0]; j++)
1241         n[i + j] = p[p[0] - j];
1242
1243     /* Allocate o of size pqlen, copy q to o */
1244     o = snewn(pqlen, BignumInt);
1245     i = pqlen - q[0];
1246     for (j = 0; j < i; j++)
1247         o[j] = 0;
1248     for (j = 0; j < (int)q[0]; j++)
1249         o[i + j] = q[q[0] - j];
1250
1251     /* Allocate a of size 2*pqlen for result */
1252     a = snewn(2 * pqlen, BignumInt);
1253
1254     /* Scratch space for multiplies */
1255     scratchlen = mul_compute_scratch(pqlen);
1256     scratch = snewn(scratchlen, BignumInt);
1257
1258     /* Compute reciprocal of the top full word of the modulus */
1259     {
1260         BignumInt m0 = m[0];
1261         rshift = bn_clz(m0);
1262         if (rshift) {
1263             m0 <<= rshift;
1264             if (mlen > 1)
1265                 m0 |= m[1] >> (BIGNUM_INT_BITS - rshift);
1266         }
1267         recip = reciprocal_word(m0);
1268     }
1269
1270     /* Main computation */
1271     internal_mul(n, o, a, pqlen, scratch);
1272     internal_mod(a, pqlen * 2, m, mlen, NULL, recip, rshift);
1273
1274     /* Copy result to buffer */
1275     rlen = (mlen < pqlen * 2 ? mlen : pqlen * 2);
1276     result = newbn(rlen);
1277     for (i = 0; i < rlen; i++)
1278         result[result[0] - i] = a[i + 2 * pqlen - rlen];
1279     while (result[0] > 1 && result[result[0]] == 0)
1280         result[0]--;
1281
1282     /* Free temporary arrays */
1283     smemclr(scratch, scratchlen * sizeof(*scratch));
1284     sfree(scratch);
1285     smemclr(a, 2 * pqlen * sizeof(*a));
1286     sfree(a);
1287     smemclr(m, mlen * sizeof(*m));
1288     sfree(m);
1289     smemclr(n, pqlen * sizeof(*n));
1290     sfree(n);
1291     smemclr(o, pqlen * sizeof(*o));
1292     sfree(o);
1293
1294     return result;
1295 }
1296
1297 Bignum modsub(const Bignum a, const Bignum b, const Bignum n)
1298 {
1299     Bignum a1, b1, ret;
1300
1301     if (bignum_cmp(a, n) >= 0) a1 = bigmod(a, n);
1302     else a1 = a;
1303     if (bignum_cmp(b, n) >= 0) b1 = bigmod(b, n);
1304     else b1 = b;
1305
1306     if (bignum_cmp(a1, b1) >= 0) /* a >= b */
1307     {
1308         ret = bigsub(a1, b1);
1309     }
1310     else
1311     {
1312         /* Handle going round the corner of the modulus without having
1313          * negative support in Bignum */
1314         Bignum tmp = bigsub(n, b1);
1315         assert(tmp);
1316         ret = bigadd(tmp, a1);
1317         freebn(tmp);
1318     }
1319
1320     if (a != a1) freebn(a1);
1321     if (b != b1) freebn(b1);
1322
1323     return ret;
1324 }
1325
1326 /*
1327  * Compute p % mod.
1328  * The most significant word of mod MUST be non-zero.
1329  * We assume that the result array is the same size as the mod array.
1330  * We optionally write out a quotient if `quotient' is non-NULL.
1331  * We can avoid writing out the result if `result' is NULL.
1332  */
1333 static void bigdivmod(Bignum p, Bignum mod, Bignum result, Bignum quotient)
1334 {
1335     BignumInt *n, *m;
1336     BignumInt recip;
1337     int rshift;
1338     int plen, mlen, i, j;
1339
1340     /*
1341      * The most significant word of mod needs to be non-zero. It
1342      * should already be, but let's make sure.
1343      */
1344     assert(mod[mod[0]] != 0);
1345
1346     /* Allocate m of size mlen, copy mod to m */
1347     /* We use big endian internally */
1348     mlen = mod[0];
1349     m = snewn(mlen, BignumInt);
1350     for (j = 0; j < mlen; j++)
1351         m[j] = mod[mod[0] - j];
1352
1353     plen = p[0];
1354     /* Ensure plen > mlen */
1355     if (plen <= mlen)
1356         plen = mlen + 1;
1357
1358     /* Allocate n of size plen, copy p to n */
1359     n = snewn(plen, BignumInt);
1360     for (j = 0; j < plen; j++)
1361         n[j] = 0;
1362     for (j = 1; j <= (int)p[0]; j++)
1363         n[plen - j] = p[j];
1364
1365     /* Compute reciprocal of the top full word of the modulus */
1366     {
1367         BignumInt m0 = m[0];
1368         rshift = bn_clz(m0);
1369         if (rshift) {
1370             m0 <<= rshift;
1371             if (mlen > 1)
1372                 m0 |= m[1] >> (BIGNUM_INT_BITS - rshift);
1373         }
1374         recip = reciprocal_word(m0);
1375     }
1376
1377     /* Main computation */
1378     internal_mod(n, plen, m, mlen, quotient, recip, rshift);
1379
1380     /* Copy result to buffer */
1381     if (result) {
1382         for (i = 1; i <= (int)result[0]; i++) {
1383             int j = plen - i;
1384             result[i] = j >= 0 ? n[j] : 0;
1385         }
1386     }
1387
1388     /* Free temporary arrays */
1389     smemclr(m, mlen * sizeof(*m));
1390     sfree(m);
1391     smemclr(n, plen * sizeof(*n));
1392     sfree(n);
1393 }
1394
1395 /*
1396  * Decrement a number.
1397  */
1398 void decbn(Bignum bn)
1399 {
1400     int i = 1;
1401     while (i < (int)bn[0] && bn[i] == 0)
1402         bn[i++] = BIGNUM_INT_MASK;
1403     bn[i]--;
1404 }
1405
1406 Bignum bignum_from_bytes(const unsigned char *data, int nbytes)
1407 {
1408     Bignum result;
1409     int w, i;
1410
1411     assert(nbytes >= 0 && nbytes < INT_MAX/8);
1412
1413     w = (nbytes + BIGNUM_INT_BYTES - 1) / BIGNUM_INT_BYTES; /* bytes->words */
1414
1415     result = newbn(w);
1416     for (i = 1; i <= w; i++)
1417         result[i] = 0;
1418     for (i = nbytes; i--;) {
1419         unsigned char byte = *data++;
1420         result[1 + i / BIGNUM_INT_BYTES] |=
1421             (BignumInt)byte << (8*i % BIGNUM_INT_BITS);
1422     }
1423
1424     bn_restore_invariant(result);
1425     return result;
1426 }
1427
1428 Bignum bignum_from_bytes_le(const unsigned char *data, int nbytes)
1429 {
1430     Bignum result;
1431     int w, i;
1432
1433     assert(nbytes >= 0 && nbytes < INT_MAX/8);
1434
1435     w = (nbytes + BIGNUM_INT_BYTES - 1) / BIGNUM_INT_BYTES; /* bytes->words */
1436
1437     result = newbn(w);
1438     for (i = 1; i <= w; i++)
1439         result[i] = 0;
1440     for (i = 0; i < nbytes; ++i) {
1441         unsigned char byte = *data++;
1442         result[1 + i / BIGNUM_INT_BYTES] |=
1443             (BignumInt)byte << (8*i % BIGNUM_INT_BITS);
1444     }
1445
1446     bn_restore_invariant(result);
1447     return result;
1448 }
1449
1450 Bignum bignum_from_decimal(const char *decimal)
1451 {
1452     Bignum result = copybn(Zero);
1453
1454     while (*decimal) {
1455         Bignum tmp, tmp2;
1456
1457         if (!isdigit((unsigned char)*decimal)) {
1458             freebn(result);
1459             return 0;
1460         }
1461
1462         tmp = bigmul(result, Ten);
1463         tmp2 = bignum_from_long(*decimal - '0');
1464         freebn(result);
1465         result = bigadd(tmp, tmp2);
1466         freebn(tmp);
1467         freebn(tmp2);
1468
1469         decimal++;
1470     }
1471
1472     return result;
1473 }
1474
1475 Bignum bignum_random_in_range(const Bignum lower, const Bignum upper)
1476 {
1477     Bignum ret = NULL;
1478     unsigned char *bytes;
1479     int upper_len = bignum_bitcount(upper);
1480     int upper_bytes = upper_len / 8;
1481     int upper_bits = upper_len % 8;
1482     if (upper_bits) ++upper_bytes;
1483
1484     bytes = snewn(upper_bytes, unsigned char);
1485     do {
1486         int i;
1487
1488         if (ret) freebn(ret);
1489
1490         for (i = 0; i < upper_bytes; ++i)
1491         {
1492             bytes[i] = (unsigned char)random_byte();
1493         }
1494         /* Mask the top to reduce failure rate to 50/50 */
1495         if (upper_bits)
1496         {
1497             bytes[i - 1] &= 0xFF >> (8 - upper_bits);
1498         }
1499
1500         ret = bignum_from_bytes(bytes, upper_bytes);
1501     } while (bignum_cmp(ret, lower) < 0 || bignum_cmp(ret, upper) > 0);
1502     smemclr(bytes, upper_bytes);
1503     sfree(bytes);
1504
1505     return ret;
1506 }
1507
1508 /*
1509  * Read an SSH-1-format bignum from a data buffer. Return the number
1510  * of bytes consumed, or -1 if there wasn't enough data.
1511  */
1512 int ssh1_read_bignum(const unsigned char *data, int len, Bignum * result)
1513 {
1514     const unsigned char *p = data;
1515     int i;
1516     int w, b;
1517
1518     if (len < 2)
1519         return -1;
1520
1521     w = 0;
1522     for (i = 0; i < 2; i++)
1523         w = (w << 8) + *p++;
1524     b = (w + 7) / 8;                   /* bits -> bytes */
1525
1526     if (len < b+2)
1527         return -1;
1528
1529     if (!result)                       /* just return length */
1530         return b + 2;
1531
1532     *result = bignum_from_bytes(p, b);
1533
1534     return p + b - data;
1535 }
1536
1537 /*
1538  * Return the bit count of a bignum, for SSH-1 encoding.
1539  */
1540 int bignum_bitcount(Bignum bn)
1541 {
1542     int bitcount = bn[0] * BIGNUM_INT_BITS - 1;
1543     while (bitcount >= 0
1544            && (bn[bitcount / BIGNUM_INT_BITS + 1] >> (bitcount % BIGNUM_INT_BITS)) == 0) bitcount--;
1545     return bitcount + 1;
1546 }
1547
1548 /*
1549  * Return the byte length of a bignum when SSH-1 encoded.
1550  */
1551 int ssh1_bignum_length(Bignum bn)
1552 {
1553     return 2 + (bignum_bitcount(bn) + 7) / 8;
1554 }
1555
1556 /*
1557  * Return the byte length of a bignum when SSH-2 encoded.
1558  */
1559 int ssh2_bignum_length(Bignum bn)
1560 {
1561     return 4 + (bignum_bitcount(bn) + 8) / 8;
1562 }
1563
1564 /*
1565  * Return a byte from a bignum; 0 is least significant, etc.
1566  */
1567 int bignum_byte(Bignum bn, int i)
1568 {
1569     if (i < 0 || i >= (int)(BIGNUM_INT_BYTES * bn[0]))
1570         return 0;                      /* beyond the end */
1571     else
1572         return (bn[i / BIGNUM_INT_BYTES + 1] >>
1573                 ((i % BIGNUM_INT_BYTES)*8)) & 0xFF;
1574 }
1575
1576 /*
1577  * Return a bit from a bignum; 0 is least significant, etc.
1578  */
1579 int bignum_bit(Bignum bn, int i)
1580 {
1581     if (i < 0 || i >= (int)(BIGNUM_INT_BITS * bn[0]))
1582         return 0;                      /* beyond the end */
1583     else
1584         return (bn[i / BIGNUM_INT_BITS + 1] >> (i % BIGNUM_INT_BITS)) & 1;
1585 }
1586
1587 /*
1588  * Set a bit in a bignum; 0 is least significant, etc.
1589  */
1590 void bignum_set_bit(Bignum bn, int bitnum, int value)
1591 {
1592     if (bitnum < 0 || bitnum >= (int)(BIGNUM_INT_BITS * bn[0])) {
1593         if (value) abort();                    /* beyond the end */
1594     } else {
1595         int v = bitnum / BIGNUM_INT_BITS + 1;
1596         BignumInt mask = (BignumInt)1 << (bitnum % BIGNUM_INT_BITS);
1597         if (value)
1598             bn[v] |= mask;
1599         else
1600             bn[v] &= ~mask;
1601     }
1602 }
1603
1604 /*
1605  * Write a SSH-1-format bignum into a buffer. It is assumed the
1606  * buffer is big enough. Returns the number of bytes used.
1607  */
1608 int ssh1_write_bignum(void *data, Bignum bn)
1609 {
1610     unsigned char *p = data;
1611     int len = ssh1_bignum_length(bn);
1612     int i;
1613     int bitc = bignum_bitcount(bn);
1614
1615     *p++ = (bitc >> 8) & 0xFF;
1616     *p++ = (bitc) & 0xFF;
1617     for (i = len - 2; i--;)
1618         *p++ = bignum_byte(bn, i);
1619     return len;
1620 }
1621
1622 /*
1623  * Compare two bignums. Returns like strcmp.
1624  */
1625 int bignum_cmp(Bignum a, Bignum b)
1626 {
1627     int amax = a[0], bmax = b[0];
1628     int i;
1629
1630     /* Annoyingly we have two representations of zero */
1631     if (amax == 1 && a[amax] == 0)
1632         amax = 0;
1633     if (bmax == 1 && b[bmax] == 0)
1634         bmax = 0;
1635
1636     assert(amax == 0 || a[amax] != 0);
1637     assert(bmax == 0 || b[bmax] != 0);
1638
1639     i = (amax > bmax ? amax : bmax);
1640     while (i) {
1641         BignumInt aval = (i > amax ? 0 : a[i]);
1642         BignumInt bval = (i > bmax ? 0 : b[i]);
1643         if (aval < bval)
1644             return -1;
1645         if (aval > bval)
1646             return +1;
1647         i--;
1648     }
1649     return 0;
1650 }
1651
1652 /*
1653  * Right-shift one bignum to form another.
1654  */
1655 Bignum bignum_rshift(Bignum a, int shift)
1656 {
1657     Bignum ret;
1658     int i, shiftw, shiftb, shiftbb, bits;
1659     BignumInt ai, ai1;
1660
1661     assert(shift >= 0);
1662
1663     bits = bignum_bitcount(a) - shift;
1664     ret = newbn((bits + BIGNUM_INT_BITS - 1) / BIGNUM_INT_BITS);
1665
1666     if (ret) {
1667         shiftw = shift / BIGNUM_INT_BITS;
1668         shiftb = shift % BIGNUM_INT_BITS;
1669         shiftbb = BIGNUM_INT_BITS - shiftb;
1670
1671         ai1 = a[shiftw + 1];
1672         for (i = 1; i <= (int)ret[0]; i++) {
1673             ai = ai1;
1674             ai1 = (i + shiftw + 1 <= (int)a[0] ? a[i + shiftw + 1] : 0);
1675             ret[i] = ((ai >> shiftb) | (ai1 << shiftbb)) & BIGNUM_INT_MASK;
1676         }
1677     }
1678
1679     return ret;
1680 }
1681
1682 /*
1683  * Left-shift one bignum to form another.
1684  */
1685 Bignum bignum_lshift(Bignum a, int shift)
1686 {
1687     Bignum ret;
1688     int bits, shiftWords, shiftBits;
1689
1690     assert(shift >= 0);
1691
1692     bits = bignum_bitcount(a) + shift;
1693     ret = newbn((bits + BIGNUM_INT_BITS - 1) / BIGNUM_INT_BITS);
1694
1695     shiftWords = shift / BIGNUM_INT_BITS;
1696     shiftBits = shift % BIGNUM_INT_BITS;
1697
1698     if (shiftBits == 0)
1699     {
1700         memcpy(&ret[1 + shiftWords], &a[1], sizeof(BignumInt) * a[0]);
1701     }
1702     else
1703     {
1704         int i;
1705         BignumInt carry = 0;
1706
1707         /* Remember that Bignum[0] is length, so add 1 */
1708         for (i = shiftWords + 1; i < ((int)a[0]) + shiftWords + 1; ++i)
1709         {
1710             BignumInt from = a[i - shiftWords];
1711             ret[i] = (from << shiftBits) | carry;
1712             carry = from >> (BIGNUM_INT_BITS - shiftBits);
1713         }
1714         if (carry) ret[i] = carry;
1715     }
1716
1717     return ret;
1718 }
1719
1720 /*
1721  * Non-modular multiplication and addition.
1722  */
1723 Bignum bigmuladd(Bignum a, Bignum b, Bignum addend)
1724 {
1725     int alen = a[0], blen = b[0];
1726     int mlen = (alen > blen ? alen : blen);
1727     int rlen, i, maxspot;
1728     int wslen;
1729     BignumInt *workspace;
1730     Bignum ret;
1731
1732     /* mlen space for a, mlen space for b, 2*mlen for result,
1733      * plus scratch space for multiplication */
1734     wslen = mlen * 4 + mul_compute_scratch(mlen);
1735     workspace = snewn(wslen, BignumInt);
1736     for (i = 0; i < mlen; i++) {
1737         workspace[0 * mlen + i] = (mlen - i <= (int)a[0] ? a[mlen - i] : 0);
1738         workspace[1 * mlen + i] = (mlen - i <= (int)b[0] ? b[mlen - i] : 0);
1739     }
1740
1741     internal_mul(workspace + 0 * mlen, workspace + 1 * mlen,
1742                  workspace + 2 * mlen, mlen, workspace + 4 * mlen);
1743
1744     /* now just copy the result back */
1745     rlen = alen + blen + 1;
1746     if (addend && rlen <= (int)addend[0])
1747         rlen = addend[0] + 1;
1748     ret = newbn(rlen);
1749     maxspot = 0;
1750     for (i = 1; i <= (int)ret[0]; i++) {
1751         ret[i] = (i <= 2 * mlen ? workspace[4 * mlen - i] : 0);
1752         if (ret[i] != 0)
1753             maxspot = i;
1754     }
1755     ret[0] = maxspot;
1756
1757     /* now add in the addend, if any */
1758     if (addend) {
1759         BignumCarry carry = 0;
1760         for (i = 1; i <= rlen; i++) {
1761             BignumInt retword = (i <= (int)ret[0] ? ret[i] : 0);
1762             BignumInt addword = (i <= (int)addend[0] ? addend[i] : 0);
1763             BignumADC(ret[i], carry, retword, addword, carry);
1764             if (ret[i] != 0 && i > maxspot)
1765                 maxspot = i;
1766         }
1767     }
1768     ret[0] = maxspot;
1769
1770     smemclr(workspace, wslen * sizeof(*workspace));
1771     sfree(workspace);
1772     return ret;
1773 }
1774
1775 /*
1776  * Non-modular multiplication.
1777  */
1778 Bignum bigmul(Bignum a, Bignum b)
1779 {
1780     return bigmuladd(a, b, NULL);
1781 }
1782
1783 /*
1784  * Simple addition.
1785  */
1786 Bignum bigadd(Bignum a, Bignum b)
1787 {
1788     int alen = a[0], blen = b[0];
1789     int rlen = (alen > blen ? alen : blen) + 1;
1790     int i, maxspot;
1791     Bignum ret;
1792     BignumCarry carry;
1793
1794     ret = newbn(rlen);
1795
1796     carry = 0;
1797     maxspot = 0;
1798     for (i = 1; i <= rlen; i++) {
1799         BignumInt aword = (i <= (int)a[0] ? a[i] : 0);
1800         BignumInt bword = (i <= (int)b[0] ? b[i] : 0);
1801         BignumADC(ret[i], carry, aword, bword, carry);
1802         if (ret[i] != 0 && i > maxspot)
1803             maxspot = i;
1804     }
1805     ret[0] = maxspot;
1806
1807     return ret;
1808 }
1809
1810 /*
1811  * Subtraction. Returns a-b, or NULL if the result would come out
1812  * negative (recall that this entire bignum module only handles
1813  * positive numbers).
1814  */
1815 Bignum bigsub(Bignum a, Bignum b)
1816 {
1817     int alen = a[0], blen = b[0];
1818     int rlen = (alen > blen ? alen : blen);
1819     int i, maxspot;
1820     Bignum ret;
1821     BignumCarry carry;
1822
1823     ret = newbn(rlen);
1824
1825     carry = 1;
1826     maxspot = 0;
1827     for (i = 1; i <= rlen; i++) {
1828         BignumInt aword = (i <= (int)a[0] ? a[i] : 0);
1829         BignumInt bword = (i <= (int)b[0] ? b[i] : 0);
1830         BignumADC(ret[i], carry, aword, ~bword, carry);
1831         if (ret[i] != 0 && i > maxspot)
1832             maxspot = i;
1833     }
1834     ret[0] = maxspot;
1835
1836     if (!carry) {
1837         freebn(ret);
1838         return NULL;
1839     }
1840
1841     return ret;
1842 }
1843
1844 /*
1845  * Create a bignum which is the bitmask covering another one. That
1846  * is, the smallest integer which is >= N and is also one less than
1847  * a power of two.
1848  */
1849 Bignum bignum_bitmask(Bignum n)
1850 {
1851     Bignum ret = copybn(n);
1852     int i;
1853     BignumInt j;
1854
1855     i = ret[0];
1856     while (n[i] == 0 && i > 0)
1857         i--;
1858     if (i <= 0)
1859         return ret;                    /* input was zero */
1860     j = 1;
1861     while (j < n[i])
1862         j = 2 * j + 1;
1863     ret[i] = j;
1864     while (--i > 0)
1865         ret[i] = BIGNUM_INT_MASK;
1866     return ret;
1867 }
1868
1869 /*
1870  * Convert an unsigned long into a bignum.
1871  */
1872 Bignum bignum_from_long(unsigned long n)
1873 {
1874     const int maxwords =
1875         (sizeof(unsigned long) + sizeof(BignumInt) - 1) / sizeof(BignumInt);
1876     Bignum ret;
1877     int i;
1878
1879     ret = newbn(maxwords);
1880     ret[0] = 0;
1881     for (i = 0; i < maxwords; i++) {
1882         ret[i+1] = n >> (i * BIGNUM_INT_BITS);
1883         if (ret[i+1] != 0)
1884             ret[0] = i+1;
1885     }
1886
1887     return ret;
1888 }
1889
1890 /*
1891  * Add a long to a bignum.
1892  */
1893 Bignum bignum_add_long(Bignum number, unsigned long n)
1894 {
1895     const int maxwords =
1896         (sizeof(unsigned long) + sizeof(BignumInt) - 1) / sizeof(BignumInt);
1897     Bignum ret;
1898     int words, i;
1899     BignumCarry carry;
1900
1901     words = number[0];
1902     if (words < maxwords)
1903         words = maxwords;
1904     words++;
1905     ret = newbn(words);
1906
1907     carry = 0;
1908     ret[0] = 0;
1909     for (i = 0; i < words; i++) {
1910         BignumInt nword = (i < maxwords ? n >> (i * BIGNUM_INT_BITS) : 0);
1911         BignumInt numword = (i < number[0] ? number[i+1] : 0);
1912         BignumADC(ret[i+1], carry, numword, nword, carry);
1913         if (ret[i+1] != 0)
1914             ret[0] = i+1;
1915     }
1916     return ret;
1917 }
1918
1919 /*
1920  * Compute the residue of a bignum, modulo a (max 16-bit) short.
1921  */
1922 unsigned short bignum_mod_short(Bignum number, unsigned short modulus)
1923 {
1924     unsigned long mod = modulus, r = 0;
1925     /* Precompute (BIGNUM_INT_MASK+1) % mod */
1926     unsigned long base_r = (BIGNUM_INT_MASK - modulus + 1) % mod;
1927     int i;
1928
1929     for (i = number[0]; i > 0; i--) {
1930         /*
1931          * Conceptually, ((r << BIGNUM_INT_BITS) + number[i]) % mod
1932          */
1933         r = ((r * base_r) + (number[i] % mod)) % mod;
1934     }
1935     return (unsigned short) r;
1936 }
1937
1938 #ifdef DEBUG
1939 void diagbn(char *prefix, Bignum md)
1940 {
1941     int i, nibbles, morenibbles;
1942     static const char hex[] = "0123456789ABCDEF";
1943
1944     debug(("%s0x", prefix ? prefix : ""));
1945
1946     nibbles = (3 + bignum_bitcount(md)) / 4;
1947     if (nibbles < 1)
1948         nibbles = 1;
1949     morenibbles = 4 * md[0] - nibbles;
1950     for (i = 0; i < morenibbles; i++)
1951         debug(("-"));
1952     for (i = nibbles; i--;)
1953         debug(("%c",
1954                hex[(bignum_byte(md, i / 2) >> (4 * (i % 2))) & 0xF]));
1955
1956     if (prefix)
1957         debug(("\n"));
1958 }
1959 #endif
1960
1961 /*
1962  * Simple division.
1963  */
1964 Bignum bigdiv(Bignum a, Bignum b)
1965 {
1966     Bignum q = newbn(a[0]);
1967     bigdivmod(a, b, NULL, q);
1968     while (q[0] > 1 && q[q[0]] == 0)
1969         q[0]--;
1970     return q;
1971 }
1972
1973 /*
1974  * Simple remainder.
1975  */
1976 Bignum bigmod(Bignum a, Bignum b)
1977 {
1978     Bignum r = newbn(b[0]);
1979     bigdivmod(a, b, r, NULL);
1980     while (r[0] > 1 && r[r[0]] == 0)
1981         r[0]--;
1982     return r;
1983 }
1984
1985 /*
1986  * Greatest common divisor.
1987  */
1988 Bignum biggcd(Bignum av, Bignum bv)
1989 {
1990     Bignum a = copybn(av);
1991     Bignum b = copybn(bv);
1992
1993     while (bignum_cmp(b, Zero) != 0) {
1994         Bignum t = newbn(b[0]);
1995         bigdivmod(a, b, t, NULL);
1996         while (t[0] > 1 && t[t[0]] == 0)
1997             t[0]--;
1998         freebn(a);
1999         a = b;
2000         b = t;
2001     }
2002
2003     freebn(b);
2004     return a;
2005 }
2006
2007 /*
2008  * Modular inverse, using Euclid's extended algorithm.
2009  */
2010 Bignum modinv(Bignum number, Bignum modulus)
2011 {
2012     Bignum a = copybn(modulus);
2013     Bignum b = copybn(number);
2014     Bignum xp = copybn(Zero);
2015     Bignum x = copybn(One);
2016     int sign = +1;
2017
2018     assert(number[number[0]] != 0);
2019     assert(modulus[modulus[0]] != 0);
2020
2021     while (bignum_cmp(b, One) != 0) {
2022         Bignum t, q;
2023
2024         if (bignum_cmp(b, Zero) == 0) {
2025             /*
2026              * Found a common factor between the inputs, so we cannot
2027              * return a modular inverse at all.
2028              */
2029             freebn(b);
2030             freebn(a);
2031             freebn(xp);
2032             freebn(x);
2033             return NULL;
2034         }
2035
2036         t = newbn(b[0]);
2037         q = newbn(a[0]);
2038         bigdivmod(a, b, t, q);
2039         while (t[0] > 1 && t[t[0]] == 0)
2040             t[0]--;
2041         while (q[0] > 1 && q[q[0]] == 0)
2042             q[0]--;
2043         freebn(a);
2044         a = b;
2045         b = t;
2046         t = xp;
2047         xp = x;
2048         x = bigmuladd(q, xp, t);
2049         sign = -sign;
2050         freebn(t);
2051         freebn(q);
2052     }
2053
2054     freebn(b);
2055     freebn(a);
2056     freebn(xp);
2057
2058     /* now we know that sign * x == 1, and that x < modulus */
2059     if (sign < 0) {
2060         /* set a new x to be modulus - x */
2061         Bignum newx = newbn(modulus[0]);
2062         BignumInt carry = 0;
2063         int maxspot = 1;
2064         int i;
2065
2066         for (i = 1; i <= (int)newx[0]; i++) {
2067             BignumInt aword = (i <= (int)modulus[0] ? modulus[i] : 0);
2068             BignumInt bword = (i <= (int)x[0] ? x[i] : 0);
2069             newx[i] = aword - bword - carry;
2070             bword = ~bword;
2071             carry = carry ? (newx[i] >= bword) : (newx[i] > bword);
2072             if (newx[i] != 0)
2073                 maxspot = i;
2074         }
2075         newx[0] = maxspot;
2076         freebn(x);
2077         x = newx;
2078     }
2079
2080     /* and return. */
2081     return x;
2082 }
2083
2084 /*
2085  * Render a bignum into decimal. Return a malloced string holding
2086  * the decimal representation.
2087  */
2088 char *bignum_decimal(Bignum x)
2089 {
2090     int ndigits, ndigit;
2091     int i, iszero;
2092     BignumInt carry;
2093     char *ret;
2094     BignumInt *workspace;
2095
2096     /*
2097      * First, estimate the number of digits. Since log(10)/log(2)
2098      * is just greater than 93/28 (the joys of continued fraction
2099      * approximations...) we know that for every 93 bits, we need
2100      * at most 28 digits. This will tell us how much to malloc.
2101      *
2102      * Formally: if x has i bits, that means x is strictly less
2103      * than 2^i. Since 2 is less than 10^(28/93), this is less than
2104      * 10^(28i/93). We need an integer power of ten, so we must
2105      * round up (rounding down might make it less than x again).
2106      * Therefore if we multiply the bit count by 28/93, rounding
2107      * up, we will have enough digits.
2108      *
2109      * i=0 (i.e., x=0) is an irritating special case.
2110      */
2111     i = bignum_bitcount(x);
2112     if (!i)
2113         ndigits = 1;                   /* x = 0 */
2114     else
2115         ndigits = (28 * i + 92) / 93;  /* multiply by 28/93 and round up */
2116     ndigits++;                         /* allow for trailing \0 */
2117     ret = snewn(ndigits, char);
2118
2119     /*
2120      * Now allocate some workspace to hold the binary form as we
2121      * repeatedly divide it by ten. Initialise this to the
2122      * big-endian form of the number.
2123      */
2124     workspace = snewn(x[0], BignumInt);
2125     for (i = 0; i < (int)x[0]; i++)
2126         workspace[i] = x[x[0] - i];
2127
2128     /*
2129      * Next, write the decimal number starting with the last digit.
2130      * We use ordinary short division, dividing 10 into the
2131      * workspace.
2132      */
2133     ndigit = ndigits - 1;
2134     ret[ndigit] = '\0';
2135     do {
2136         iszero = 1;
2137         carry = 0;
2138         for (i = 0; i < (int)x[0]; i++) {
2139             /*
2140              * Conceptually, we want to compute
2141              *
2142              *   (carry << BIGNUM_INT_BITS) + workspace[i]
2143              *   -----------------------------------------
2144              *                      10
2145              *
2146              * but we don't have an integer type longer than BignumInt
2147              * to work with. So we have to do it in pieces.
2148              */
2149
2150             BignumInt q, r;
2151             q = workspace[i] / 10;
2152             r = workspace[i] % 10;
2153
2154             /* I want (BIGNUM_INT_MASK+1)/10 but can't say so directly! */
2155             q += carry * ((BIGNUM_INT_MASK-9) / 10 + 1);
2156             r += carry * ((BIGNUM_INT_MASK-9) % 10);
2157
2158             q += r / 10;
2159             r %= 10;
2160
2161             workspace[i] = q;
2162             carry = r;
2163
2164             if (workspace[i])
2165                 iszero = 0;
2166         }
2167         ret[--ndigit] = (char) (carry + '0');
2168     } while (!iszero);
2169
2170     /*
2171      * There's a chance we've fallen short of the start of the
2172      * string. Correct if so.
2173      */
2174     if (ndigit > 0)
2175         memmove(ret, ret + ndigit, ndigits - ndigit);
2176
2177     /*
2178      * Done.
2179      */
2180     smemclr(workspace, x[0] * sizeof(*workspace));
2181     sfree(workspace);
2182     return ret;
2183 }