+ {
+ BignumInt unused;
+ BignumMUL(q, unused, recip, aword);
+ (void)unused;
+ }
+
+#ifdef DIVISION_DEBUG
+ printf("i=%d, aword=%#0*llx, shift=%d, q=%#0*llx\n",
+ i, BIGNUM_INT_BITS/4, (unsigned long long)aword,
+ shift, BIGNUM_INT_BITS/4, (unsigned long long)q);
+#endif
+
+ /*
+ * Work out the right bit and word offsets to use when
+ * subtracting q*m from a.
+ *
+ * aword was taken from a[i], which means its LSB was at bit
+ * position (alen-1-i) * BIGNUM_INT_BITS. But then we shifted
+ * it left by 'shift', so now the low bit of aword corresponds
+ * to bit position (alen-1-i) * BIGNUM_INT_BITS - shift, i.e.
+ * aword is approximately equal to a / 2^(that).
+ *
+ * m0 comes from the top word of mod, so its LSB is at bit
+ * position (mlen-1) * BIGNUM_INT_BITS - rshift, i.e. it can
+ * be considered to be m / 2^(that power). 'recip' is the
+ * reciprocal of m0, times 2^(BIGNUM_INT_BITS*2-1), i.e. it's
+ * about 2^((mlen+1) * BIGNUM_INT_BITS - rshift - 1) / m.
+ *
+ * Hence, recip * aword is approximately equal to the product
+ * of those, which simplifies to
+ *
+ * a/m * 2^((mlen+2+i-alen)*BIGNUM_INT_BITS + shift - rshift - 1)
+ *
+ * But we've also shifted recip*aword down by BIGNUM_INT_BITS
+ * to form q, so we have
+ *
+ * q ~= a/m * 2^((mlen+1+i-alen)*BIGNUM_INT_BITS + shift - rshift - 1)
+ *
+ * and hence, when we now compute q*m, it will be about
+ * a*2^(all that lot), i.e. the negation of that expression is
+ * how far left we have to shift the product q*m to make it
+ * approximately equal to a.
+ */
+ full_bitoffset = -((mlen+1+i-alen)*BIGNUM_INT_BITS + shift-rshift-1);
+#ifdef DIVISION_DEBUG
+ printf("full_bitoffset=%d\n", full_bitoffset);
+#endif
+
+ if (full_bitoffset < 0) {
+ /*
+ * If we find ourselves needing to shift q*m _right_, that
+ * means we've reached the bottom of the quotient. Clip q
+ * so that its right shift becomes zero, and if that means
+ * q becomes _actually_ zero, this loop is done.
+ */
+ if (full_bitoffset <= -BIGNUM_INT_BITS)
+ break;
+ q >>= -full_bitoffset;
+ full_bitoffset = 0;
+ if (!q)
+ break;
+#ifdef DIVISION_DEBUG
+ printf("now full_bitoffset=%d, q=%#0*llx\n",
+ full_bitoffset, BIGNUM_INT_BITS/4, (unsigned long long)q);
+#endif
+ }
+
+ wordoffset = full_bitoffset / BIGNUM_INT_BITS;
+ bitoffset = full_bitoffset % BIGNUM_INT_BITS;
+#ifdef DIVISION_DEBUG
+ printf("wordoffset=%d, bitoffset=%d\n", wordoffset, bitoffset);
+#endif
+
+ /* wordoffset as computed above is the offset between the LSWs
+ * of m and a. But in fact m and a are stored MSW-first, so we
+ * need to adjust it to be the offset between the actual array
+ * indices, and flip the sign too. */
+ wordoffset = alen - mlen - wordoffset;
+
+ if (bitoffset == 0) {
+ BignumCarry c = 1;
+ BignumInt prev_hi_word = 0;
+ for (k = mlen - 1; wordoffset+k >= i; k--) {
+ BignumInt mword = k<0 ? 0 : m[k];
+ BignumMULADD(prev_hi_word, product, q, mword, prev_hi_word);
+#ifdef DIVISION_DEBUG
+ printf(" aligned sub: product word for m[%d] = %#0*llx\n",
+ k, BIGNUM_INT_BITS/4,
+ (unsigned long long)product);
+#endif
+#ifdef DIVISION_DEBUG
+ printf(" aligned sub: subtrahend for a[%d] = %#0*llx\n",
+ wordoffset+k, BIGNUM_INT_BITS/4,
+ (unsigned long long)product);
+#endif
+ BignumADC(a[wordoffset+k], c, a[wordoffset+k], ~product, c);
+ }
+ } else {
+ BignumInt add_word = 0;
+ BignumInt c = 1;
+ BignumInt prev_hi_word = 0;
+ for (k = mlen - 1; wordoffset+k >= i; k--) {
+ BignumInt mword = k<0 ? 0 : m[k];
+ BignumMULADD(prev_hi_word, product, q, mword, prev_hi_word);
+#ifdef DIVISION_DEBUG
+ printf(" unaligned sub: product word for m[%d] = %#0*llx\n",
+ k, BIGNUM_INT_BITS/4,
+ (unsigned long long)product);
+#endif
+
+ add_word |= product << bitoffset;
+
+#ifdef DIVISION_DEBUG
+ printf(" unaligned sub: subtrahend for a[%d] = %#0*llx\n",
+ wordoffset+k,
+ BIGNUM_INT_BITS/4, (unsigned long long)add_word);
+#endif
+ BignumADC(a[wordoffset+k], c, a[wordoffset+k], ~add_word, c);
+
+ add_word = product >> (BIGNUM_INT_BITS - bitoffset);
+ }
+ }
+
+ if (quot) {
+#ifdef DIVISION_DEBUG
+ printf("adding quotient word %#0*llx << %d\n",
+ BIGNUM_INT_BITS/4, (unsigned long long)q, full_bitoffset);
+#endif
+ internal_add_shifted(quot, q, full_bitoffset);
+#ifdef DIVISION_DEBUG
+ {
+ int d;
+ printf("now quot=0x");
+ for (d = quot[0]; d > 0; d--)
+ printf("%0*llx", BIGNUM_INT_BITS/4,
+ (unsigned long long)quot[d]);
+ printf("\n");
+ }
+#endif
+ }
+ }
+
+#ifdef DIVISION_DEBUG
+ {
+ int d;
+ printf("end main loop, a=0x");
+ for (d = 0; d < alen; d++)
+ printf("%0*llx", BIGNUM_INT_BITS/4, (unsigned long long)a[d]);
+ if (quot) {
+ printf(", quot=0x");
+ for (d = quot[0]; d > 0; d--)
+ printf("%0*llx", BIGNUM_INT_BITS/4,
+ (unsigned long long)quot[d]);
+ }
+ printf("\n");
+ }
+#endif
+
+ /*
+ * The above loop should terminate with the remaining value in a
+ * being strictly less than 2*m (if a >= 2*m then we should always
+ * have managed to get a nonzero q word), but we can't guarantee
+ * that it will be strictly less than m: consider a case where the
+ * remainder is 1, and another where the remainder is m-1. By the
+ * time a contains a value that's _about m_, you clearly can't
+ * distinguish those cases by looking at only the top word of a -
+ * you have to go all the way down to the bottom before you find
+ * out whether it's just less or just more than m.
+ *
+ * Hence, we now do a final fixup in which we subtract one last
+ * copy of m, or don't, accordingly. We should never have to
+ * subtract more than one copy of m here.
+ */
+ for (i = 0; i < alen; i++) {
+ /* Compare a with m, word by word, from the MSW down. As soon
+ * as we encounter a difference, we know whether we need the
+ * fixup. */
+ int mindex = mlen-alen+i;
+ BignumInt mword = mindex < 0 ? 0 : m[mindex];
+ if (a[i] < mword) {
+#ifdef DIVISION_DEBUG
+ printf("final fixup not needed, a < m\n");
+#endif
+ return;
+ } else if (a[i] > mword) {
+#ifdef DIVISION_DEBUG
+ printf("final fixup is needed, a > m\n");
+#endif
+ break;
+ }
+ /* If neither of those cases happened, the words are the same,
+ * so keep going and look at the next one. */
+ }
+#ifdef DIVISION_DEBUG
+ if (i == mlen) /* if we printed neither of the above diagnostics */
+ printf("final fixup is needed, a == m\n");
+#endif
+
+ /*
+ * If we got here without returning, then a >= m, so we must
+ * subtract m, and increment the quotient.
+ */
+ {
+ BignumCarry c = 1;
+ for (i = alen - 1; i >= 0; i--) {
+ int mindex = mlen-alen+i;
+ BignumInt mword = mindex < 0 ? 0 : m[mindex];
+ BignumADC(a[i], c, a[i], ~mword, c);
+ }
+ }
+ if (quot)
+ internal_add_shifted(quot, 1, 0);
+
+#ifdef DIVISION_DEBUG
+ {
+ int d;
+ printf("after final fixup, a=0x");
+ for (d = 0; d < alen; d++)
+ printf("%0*llx", BIGNUM_INT_BITS/4, (unsigned long long)a[d]);
+ if (quot) {
+ printf(", quot=0x");
+ for (d = quot[0]; d > 0; d--)
+ printf("%0*llx", BIGNUM_INT_BITS/4,
+ (unsigned long long)quot[d]);
+ }
+ printf("\n");