+/*
+ * Montgomery reduction. Expects x to be a big-endian array of 2*len
+ * BignumInts whose value satisfies 0 <= x < rn (where r = 2^(len *
+ * BIGNUM_INT_BITS) is the Montgomery base). Returns in the same array
+ * a value x' which is congruent to xr^{-1} mod n, and satisfies 0 <=
+ * x' < n.
+ *
+ * 'n' and 'mninv' should be big-endian arrays of 'len' BignumInts
+ * each, containing respectively n and the multiplicative inverse of
+ * -n mod r.
+ *
+ * 'tmp' is an array of BignumInt used as scratch space, of length at
+ * least 3*len + mul_compute_scratch(len).
+ */
+static void monty_reduce(BignumInt *x, const BignumInt *n,
+ const BignumInt *mninv, BignumInt *tmp, int len)
+{
+ int i;
+ BignumInt carry;
+
+ /*
+ * Multiply x by (-n)^{-1} mod r. This gives us a value m such
+ * that mn is congruent to -x mod r. Hence, mn+x is an exact
+ * multiple of r, and is also (obviously) congruent to x mod n.
+ */
+ internal_mul_low(x + len, mninv, tmp, len, tmp + 3*len);
+
+ /*
+ * Compute t = (mn+x)/r in ordinary, non-modular, integer
+ * arithmetic. By construction this is exact, and is congruent mod
+ * n to x * r^{-1}, i.e. the answer we want.
+ *
+ * The following multiply leaves that answer in the _most_
+ * significant half of the 'x' array, so then we must shift it
+ * down.
+ */
+ internal_mul(tmp, n, tmp+len, len, tmp + 3*len);
+ carry = internal_add(x, tmp+len, x, 2*len);
+ for (i = 0; i < len; i++)
+ x[len + i] = x[i], x[i] = 0;
+
+ /*
+ * Reduce t mod n. This doesn't require a full-on division by n,
+ * but merely a test and single optional subtraction, since we can
+ * show that 0 <= t < 2n.
+ *
+ * Proof:
+ * + we computed m mod r, so 0 <= m < r.
+ * + so 0 <= mn < rn, obviously
+ * + hence we only need 0 <= x < rn to guarantee that 0 <= mn+x < 2rn
+ * + yielding 0 <= (mn+x)/r < 2n as required.
+ */
+ if (!carry) {
+ for (i = 0; i < len; i++)
+ if (x[len + i] != n[i])
+ break;
+ }
+ if (carry || i >= len || x[len + i] > n[i])
+ internal_sub(x+len, n, x+len, len);
+}
+
+static void internal_add_shifted(BignumInt *number,
+ BignumInt n, int shift)
+{
+ int word = 1 + (shift / BIGNUM_INT_BITS);
+ int bshift = shift % BIGNUM_INT_BITS;
+ BignumInt addendh, addendl;
+ BignumCarry carry;
+
+ addendl = n << bshift;
+ addendh = (bshift == 0 ? 0 : n >> (BIGNUM_INT_BITS - bshift));
+
+ assert(word <= number[0]);
+ BignumADC(number[word], carry, number[word], addendl, 0);
+ word++;
+ if (!addendh && !carry)
+ return;
+ assert(word <= number[0]);
+ BignumADC(number[word], carry, number[word], addendh, carry);
+ word++;
+ while (carry) {
+ assert(word <= number[0]);
+ BignumADC(number[word], carry, number[word], 0, carry);
+ word++;
+ }
+}
+
+static int bn_clz(BignumInt x)
+{
+ /*
+ * Count the leading zero bits in x. Equivalently, how far left
+ * would we need to shift x to make its top bit set?
+ *
+ * Precondition: x != 0.
+ */
+
+ /* FIXME: would be nice to put in some compiler intrinsics under
+ * ifdef here */
+ int i, ret = 0;
+ for (i = BIGNUM_INT_BITS / 2; i != 0; i >>= 1) {
+ if ((x >> (BIGNUM_INT_BITS-i)) == 0) {
+ x <<= i;
+ ret += i;
+ }
+ }
+ return ret;
+}
+
+static BignumInt reciprocal_word(BignumInt d)
+{
+ BignumInt dshort, recip, prodh, prodl;
+ int corrections;
+
+ /*
+ * Input: a BignumInt value d, with its top bit set.
+ */
+ assert(d >> (BIGNUM_INT_BITS-1) == 1);
+
+ /*
+ * Output: a value, shifted to fill a BignumInt, which is strictly
+ * less than 1/(d+1), i.e. is an *under*-estimate (but by as
+ * little as possible within the constraints) of the reciprocal of
+ * any number whose first BIGNUM_INT_BITS bits match d.
+ *
+ * Ideally we'd like to _totally_ fill BignumInt, i.e. always
+ * return a value with the top bit set. Unfortunately we can't
+ * quite guarantee that for all inputs and also return a fixed
+ * exponent. So instead we take our reciprocal to be
+ * 2^(BIGNUM_INT_BITS*2-1) / d, so that it has the top bit clear
+ * only in the exceptional case where d takes exactly the maximum
+ * value BIGNUM_INT_MASK; in that case, the top bit is clear and
+ * the next bit down is set.
+ */
+
+ /*
+ * Start by computing a half-length version of the answer, by
+ * straightforward division within a BignumInt.
+ */
+ dshort = (d >> (BIGNUM_INT_BITS/2)) + 1;
+ recip = (BIGNUM_TOP_BIT + dshort - 1) / dshort;
+ recip <<= BIGNUM_INT_BITS - BIGNUM_INT_BITS/2;
+
+ /*
+ * Newton-Raphson iteration to improve that starting reciprocal
+ * estimate: take f(x) = d - 1/x, and then the N-R formula gives
+ * x_new = x - f(x)/f'(x) = x - (d-1/x)/(1/x^2) = x(2-d*x). Or,
+ * taking our fixed-point representation into account, take f(x)
+ * to be d - K/x (where K = 2^(BIGNUM_INT_BITS*2-1) as discussed
+ * above) and then we get (2K - d*x) * x/K.
+ *
+ * Newton-Raphson doubles the number of correct bits at every
+ * iteration, and the initial division above already gave us half
+ * the output word, so it's only worth doing one iteration.
+ */
+ BignumMULADD(prodh, prodl, recip, d, recip);
+ prodl = ~prodl;
+ prodh = ~prodh;
+ {
+ BignumCarry c;
+ BignumADC(prodl, c, prodl, 1, 0);
+ prodh += c;
+ }
+ BignumMUL(prodh, prodl, prodh, recip);
+ recip = (prodh << 1) | (prodl >> (BIGNUM_INT_BITS-1));
+
+ /*
+ * Now make sure we have the best possible reciprocal estimate,
+ * before we return it. We might have been off by a handful either
+ * way - not enough to bother with any better-thought-out kind of
+ * correction loop.
+ */
+ BignumMULADD(prodh, prodl, recip, d, recip);
+ corrections = 0;
+ if (prodh >= BIGNUM_TOP_BIT) {
+ do {
+ BignumCarry c = 1;
+ BignumADC(prodl, c, prodl, ~d, c); prodh += BIGNUM_INT_MASK + c;
+ recip--;
+ corrections++;
+ } while (prodh >= ((BignumInt)1 << (BIGNUM_INT_BITS-1)));
+ } else {
+ while (1) {
+ BignumInt newprodh, newprodl;
+ BignumCarry c = 0;
+ BignumADC(newprodl, c, prodl, d, c); newprodh = prodh + c;
+ if (newprodh >= BIGNUM_TOP_BIT)
+ break;
+ prodh = newprodh;
+ prodl = newprodl;
+ recip++;
+ corrections++;
+ }
+ }
+
+ return recip;
+}
+