2 * ChaCha20-Poly1305 Implementation for SSH-2
5 * http://cvsweb.openbsd.org/cgi-bin/cvsweb/src/usr.bin/ssh/PROTOCOL.chacha20poly1305?rev=1.2&content-type=text/x-cvsweb-markup
8 * http://cr.yp.to/chacha/chacha-20080128.pdf
11 * http://cr.yp.to/snuffle/spec.pdf
14 * http://cr.yp.to/mac/poly1305-20050329.pdf
16 * The nonce for the Poly1305 is the second part of the key output
17 * from the first round of ChaCha20. This removes the AES requirement.
18 * This is undocumented!
20 * This has an intricate link between the cipher and the MAC. The
21 * keying of both is done by the cipher and setting of the IV is
22 * done by the MAC. One cannot operate without the other. The
23 * configuration of the ssh2_cipher structure ensures that the MAC is
24 * set (and others ignored) if this cipher is chosen.
26 * This cipher also encrypts the length using a different
27 * instantiation of the cipher using a different key and IV made from
28 * the sequence number which is passed in addition when calling
29 * encrypt/decrypt on it.
39 /* ChaCha20 implementation, only supporting 256-bit keys */
41 /* State for each ChaCha20 instance */
43 /* Current context, usually with the count incremented
44 * 0-3 are the static constant
46 * 12-13 are the counter
49 /* The output of the state above ready to xor */
50 unsigned char current[64];
51 /* Index of the next unused byte in the buffer above; this is what allows a true streaming cipher */
55 static INLINE void chacha20_round(struct chacha20 *ctx)
61 memcpy(copy, ctx->state, sizeof(copy));
63 /* A circular rotation for a 32bit number */
64 #define rotl(x, shift) x = ((x << shift) | (x >> (32 - shift)))
66 /* What to do for each quarter round operation */
67 #define qrop(a, b, c, d) \
73 #define quarter(a, b, c, d) \
79 /* Do 20 rounds, in pairs because every other is different */
80 for (i = 0; i < 20; i += 2) {
84 quarter(2, 6, 10, 14);
85 quarter(3, 7, 11, 15);
86 /* Another slightly different round */
87 quarter(0, 5, 10, 15);
88 quarter(1, 6, 11, 12);
93 /* Dump the macros, don't need them littering */
98 /* Add the initial state */
99 for (i = 0; i < 16; ++i) {
100 copy[i] += ctx->state[i];
103 /* Update the content of the xor buffer */
104 for (i = 0; i < 16; ++i) {
105 ctx->current[i * 4 + 0] = copy[i] >> 0;
106 ctx->current[i * 4 + 1] = copy[i] >> 8;
107 ctx->current[i * 4 + 2] = copy[i] >> 16;
108 ctx->current[i * 4 + 3] = copy[i] >> 24;
110 /* State full, reset pointer to beginning */
111 ctx->currentIndex = 0;
112 smemclr(copy, sizeof(copy));
114 /* Increment round counter */
116 /* Check for overflow, not done in one line so the 32 bits are chopped by the type */
117 if (!(uint32)(ctx->state[12])) {
122 /* Initialise context with 256bit key */
123 static void chacha20_key(struct chacha20 *ctx, const unsigned char *key)
125 static const char constant[16] = "expand 32-byte k";
127 /* Add the fixed string to the start of the state */
128 ctx->state[0] = GET_32BIT_LSB_FIRST(constant + 0);
129 ctx->state[1] = GET_32BIT_LSB_FIRST(constant + 4);
130 ctx->state[2] = GET_32BIT_LSB_FIRST(constant + 8);
131 ctx->state[3] = GET_32BIT_LSB_FIRST(constant + 12);
134 ctx->state[4] = GET_32BIT_LSB_FIRST(key + 0);
135 ctx->state[5] = GET_32BIT_LSB_FIRST(key + 4);
136 ctx->state[6] = GET_32BIT_LSB_FIRST(key + 8);
137 ctx->state[7] = GET_32BIT_LSB_FIRST(key + 12);
138 ctx->state[8] = GET_32BIT_LSB_FIRST(key + 16);
139 ctx->state[9] = GET_32BIT_LSB_FIRST(key + 20);
140 ctx->state[10] = GET_32BIT_LSB_FIRST(key + 24);
141 ctx->state[11] = GET_32BIT_LSB_FIRST(key + 28);
143 /* New key, dump context */
144 ctx->currentIndex = 64;
147 static void chacha20_iv(struct chacha20 *ctx, const unsigned char *iv)
151 ctx->state[14] = GET_32BIT_MSB_FIRST(iv);
152 ctx->state[15] = GET_32BIT_MSB_FIRST(iv + 4);
154 /* New IV, dump context */
155 ctx->currentIndex = 64;
158 static void chacha20_encrypt(struct chacha20 *ctx, unsigned char *blk, int len)
161 /* If we don't have any state left, then cycle to the next */
162 if (ctx->currentIndex >= 64) {
166 /* Do the xor while there's some state left and some plaintext left */
167 while (ctx->currentIndex < 64 && len) {
168 *blk++ ^= ctx->current[ctx->currentIndex++];
174 /* Decrypt is encrypt... It's xor against a PRNG... */
175 static INLINE void chacha20_decrypt(struct chacha20 *ctx,
176 unsigned char *blk, int len)
178 chacha20_encrypt(ctx, blk, len);
181 /* Poly1305 implementation (no AES, nonce is not encrypted) */
183 #define NWORDS ((130 + BIGNUM_INT_BITS-1) / BIGNUM_INT_BITS)
184 typedef struct bigval {
188 static void bigval_clear(bigval *r)
191 for (i = 0; i < NWORDS; i++)
195 static void bigval_import_le(bigval *r, const void *vdata, int len)
197 const unsigned char *data = (const unsigned char *)vdata;
200 for (i = 0; i < len; i++)
201 r->w[i / BIGNUM_INT_BYTES] |=
202 (BignumInt)data[i] << (8 * (i % BIGNUM_INT_BYTES));
205 static void bigval_export_le(const bigval *r, void *vdata, int len)
207 unsigned char *data = (unsigned char *)vdata;
209 for (i = 0; i < len; i++)
210 data[i] = r->w[i / BIGNUM_INT_BYTES] >> (8 * (i % BIGNUM_INT_BYTES));
214 * Addition of bigvals, not mod p.
216 static void bigval_add(bigval *r, const bigval *a, const bigval *b)
218 #if BIGNUM_INT_BITS == 64
219 /* ./contrib/make1305.py add 64 */
234 #elif BIGNUM_INT_BITS == 32
235 /* ./contrib/make1305.py add 32 */
258 #elif BIGNUM_INT_BITS == 16
259 /* ./contrib/make1305.py add 16 */
299 #error Run contrib/make1305.py again with a different bit count
304 * Multiplication of bigvals mod p. Uses r as temporary storage, so
305 * don't pass r aliasing a or b.
307 static void bigval_mul_mod_p(bigval *r, const bigval *a, const bigval *b)
309 #if BIGNUM_INT_BITS == 64
310 /* ./contrib/make1305.py mul 64 */
317 tmp = (BignumDblInt)(a->w[0]) * (b->w[0]);
318 acclo += tmp & BIGNUM_INT_MASK;
321 acclo = acchi + (acclo >> 64);
323 tmp = (BignumDblInt)(a->w[0]) * (b->w[1]);
324 acclo += tmp & BIGNUM_INT_MASK;
326 tmp = (BignumDblInt)(a->w[1]) * (b->w[0]);
327 acclo += tmp & BIGNUM_INT_MASK;
330 acclo = acchi + (acclo >> 64);
332 tmp = (BignumDblInt)(a->w[0]) * (b->w[2]);
333 acclo += tmp & BIGNUM_INT_MASK;
335 tmp = (BignumDblInt)(a->w[1]) * (b->w[1]);
336 acclo += tmp & BIGNUM_INT_MASK;
338 tmp = (BignumDblInt)(a->w[2]) * (b->w[0]);
339 acclo += tmp & BIGNUM_INT_MASK;
341 r->w[2] = acclo & (((BignumInt)1 << 2)-1);
343 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 62)-1)) * ((BignumDblInt)5 << 0);
344 acclo = acchi + (acclo >> 64);
346 tmp = (BignumDblInt)(a->w[1]) * (b->w[2]);
347 acclo += tmp & BIGNUM_INT_MASK;
349 tmp = (BignumDblInt)(a->w[2]) * (b->w[1]);
350 acclo += tmp & BIGNUM_INT_MASK;
352 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 62);
356 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 62)-1)) * ((BignumDblInt)5 << 0);
357 acclo = acchi + (acclo >> 64);
359 tmp = (BignumDblInt)(a->w[2]) * (b->w[2]);
360 acclo += tmp & BIGNUM_INT_MASK;
362 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 62);
366 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0);
370 acc2lo += ((acclo >> 4) & (((BignumInt)1 << 60)-1)) * ((BignumDblInt)25 << 0);
371 acclo = acchi + (acclo >> 64);
373 acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 60);
377 acc2lo += ((acclo >> 4) & (((BignumInt)1 << 60)-1)) * ((BignumDblInt)25 << 0);
378 acclo = acchi + (acclo >> 64);
386 #elif BIGNUM_INT_BITS == 32
387 /* ./contrib/make1305.py mul 32 */
394 tmp = (BignumDblInt)(a->w[0]) * (b->w[0]);
395 acclo += tmp & BIGNUM_INT_MASK;
398 acclo = acchi + (acclo >> 32);
400 tmp = (BignumDblInt)(a->w[0]) * (b->w[1]);
401 acclo += tmp & BIGNUM_INT_MASK;
403 tmp = (BignumDblInt)(a->w[1]) * (b->w[0]);
404 acclo += tmp & BIGNUM_INT_MASK;
407 acclo = acchi + (acclo >> 32);
409 tmp = (BignumDblInt)(a->w[0]) * (b->w[2]);
410 acclo += tmp & BIGNUM_INT_MASK;
412 tmp = (BignumDblInt)(a->w[1]) * (b->w[1]);
413 acclo += tmp & BIGNUM_INT_MASK;
415 tmp = (BignumDblInt)(a->w[2]) * (b->w[0]);
416 acclo += tmp & BIGNUM_INT_MASK;
419 acclo = acchi + (acclo >> 32);
421 tmp = (BignumDblInt)(a->w[0]) * (b->w[3]);
422 acclo += tmp & BIGNUM_INT_MASK;
424 tmp = (BignumDblInt)(a->w[1]) * (b->w[2]);
425 acclo += tmp & BIGNUM_INT_MASK;
427 tmp = (BignumDblInt)(a->w[2]) * (b->w[1]);
428 acclo += tmp & BIGNUM_INT_MASK;
430 tmp = (BignumDblInt)(a->w[3]) * (b->w[0]);
431 acclo += tmp & BIGNUM_INT_MASK;
434 acclo = acchi + (acclo >> 32);
436 tmp = (BignumDblInt)(a->w[0]) * (b->w[4]);
437 acclo += tmp & BIGNUM_INT_MASK;
439 tmp = (BignumDblInt)(a->w[1]) * (b->w[3]);
440 acclo += tmp & BIGNUM_INT_MASK;
442 tmp = (BignumDblInt)(a->w[2]) * (b->w[2]);
443 acclo += tmp & BIGNUM_INT_MASK;
445 tmp = (BignumDblInt)(a->w[3]) * (b->w[1]);
446 acclo += tmp & BIGNUM_INT_MASK;
448 tmp = (BignumDblInt)(a->w[4]) * (b->w[0]);
449 acclo += tmp & BIGNUM_INT_MASK;
451 r->w[4] = acclo & (((BignumInt)1 << 2)-1);
453 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0);
454 acclo = acchi + (acclo >> 32);
456 tmp = (BignumDblInt)(a->w[1]) * (b->w[4]);
457 acclo += tmp & BIGNUM_INT_MASK;
459 tmp = (BignumDblInt)(a->w[2]) * (b->w[3]);
460 acclo += tmp & BIGNUM_INT_MASK;
462 tmp = (BignumDblInt)(a->w[3]) * (b->w[2]);
463 acclo += tmp & BIGNUM_INT_MASK;
465 tmp = (BignumDblInt)(a->w[4]) * (b->w[1]);
466 acclo += tmp & BIGNUM_INT_MASK;
468 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30);
472 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0);
473 acclo = acchi + (acclo >> 32);
475 tmp = (BignumDblInt)(a->w[2]) * (b->w[4]);
476 acclo += tmp & BIGNUM_INT_MASK;
478 tmp = (BignumDblInt)(a->w[3]) * (b->w[3]);
479 acclo += tmp & BIGNUM_INT_MASK;
481 tmp = (BignumDblInt)(a->w[4]) * (b->w[2]);
482 acclo += tmp & BIGNUM_INT_MASK;
484 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30);
488 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0);
489 acclo = acchi + (acclo >> 32);
491 tmp = (BignumDblInt)(a->w[3]) * (b->w[4]);
492 acclo += tmp & BIGNUM_INT_MASK;
494 tmp = (BignumDblInt)(a->w[4]) * (b->w[3]);
495 acclo += tmp & BIGNUM_INT_MASK;
497 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30);
501 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 30)-1)) * ((BignumDblInt)5 << 0);
502 acclo = acchi + (acclo >> 32);
504 tmp = (BignumDblInt)(a->w[4]) * (b->w[4]);
505 acclo += tmp & BIGNUM_INT_MASK;
507 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 30);
511 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0);
515 acc2lo += ((acclo >> 4) & (((BignumInt)1 << 28)-1)) * ((BignumDblInt)25 << 0);
516 acclo = acchi + (acclo >> 32);
518 acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 28);
522 acc2lo += ((acclo >> 4) & (((BignumInt)1 << 28)-1)) * ((BignumDblInt)25 << 0);
523 acclo = acchi + (acclo >> 32);
537 #elif BIGNUM_INT_BITS == 16
538 /* ./contrib/make1305.py mul 16 */
545 tmp = (BignumDblInt)(a->w[0]) * (b->w[0]);
546 acclo += tmp & BIGNUM_INT_MASK;
549 acclo = acchi + (acclo >> 16);
551 tmp = (BignumDblInt)(a->w[0]) * (b->w[1]);
552 acclo += tmp & BIGNUM_INT_MASK;
554 tmp = (BignumDblInt)(a->w[1]) * (b->w[0]);
555 acclo += tmp & BIGNUM_INT_MASK;
558 acclo = acchi + (acclo >> 16);
560 tmp = (BignumDblInt)(a->w[0]) * (b->w[2]);
561 acclo += tmp & BIGNUM_INT_MASK;
563 tmp = (BignumDblInt)(a->w[1]) * (b->w[1]);
564 acclo += tmp & BIGNUM_INT_MASK;
566 tmp = (BignumDblInt)(a->w[2]) * (b->w[0]);
567 acclo += tmp & BIGNUM_INT_MASK;
570 acclo = acchi + (acclo >> 16);
572 tmp = (BignumDblInt)(a->w[0]) * (b->w[3]);
573 acclo += tmp & BIGNUM_INT_MASK;
575 tmp = (BignumDblInt)(a->w[1]) * (b->w[2]);
576 acclo += tmp & BIGNUM_INT_MASK;
578 tmp = (BignumDblInt)(a->w[2]) * (b->w[1]);
579 acclo += tmp & BIGNUM_INT_MASK;
581 tmp = (BignumDblInt)(a->w[3]) * (b->w[0]);
582 acclo += tmp & BIGNUM_INT_MASK;
585 acclo = acchi + (acclo >> 16);
587 tmp = (BignumDblInt)(a->w[0]) * (b->w[4]);
588 acclo += tmp & BIGNUM_INT_MASK;
590 tmp = (BignumDblInt)(a->w[1]) * (b->w[3]);
591 acclo += tmp & BIGNUM_INT_MASK;
593 tmp = (BignumDblInt)(a->w[2]) * (b->w[2]);
594 acclo += tmp & BIGNUM_INT_MASK;
596 tmp = (BignumDblInt)(a->w[3]) * (b->w[1]);
597 acclo += tmp & BIGNUM_INT_MASK;
599 tmp = (BignumDblInt)(a->w[4]) * (b->w[0]);
600 acclo += tmp & BIGNUM_INT_MASK;
603 acclo = acchi + (acclo >> 16);
605 tmp = (BignumDblInt)(a->w[0]) * (b->w[5]);
606 acclo += tmp & BIGNUM_INT_MASK;
608 tmp = (BignumDblInt)(a->w[1]) * (b->w[4]);
609 acclo += tmp & BIGNUM_INT_MASK;
611 tmp = (BignumDblInt)(a->w[2]) * (b->w[3]);
612 acclo += tmp & BIGNUM_INT_MASK;
614 tmp = (BignumDblInt)(a->w[3]) * (b->w[2]);
615 acclo += tmp & BIGNUM_INT_MASK;
617 tmp = (BignumDblInt)(a->w[4]) * (b->w[1]);
618 acclo += tmp & BIGNUM_INT_MASK;
620 tmp = (BignumDblInt)(a->w[5]) * (b->w[0]);
621 acclo += tmp & BIGNUM_INT_MASK;
624 acclo = acchi + (acclo >> 16);
626 tmp = (BignumDblInt)(a->w[0]) * (b->w[6]);
627 acclo += tmp & BIGNUM_INT_MASK;
629 tmp = (BignumDblInt)(a->w[1]) * (b->w[5]);
630 acclo += tmp & BIGNUM_INT_MASK;
632 tmp = (BignumDblInt)(a->w[2]) * (b->w[4]);
633 acclo += tmp & BIGNUM_INT_MASK;
635 tmp = (BignumDblInt)(a->w[3]) * (b->w[3]);
636 acclo += tmp & BIGNUM_INT_MASK;
638 tmp = (BignumDblInt)(a->w[4]) * (b->w[2]);
639 acclo += tmp & BIGNUM_INT_MASK;
641 tmp = (BignumDblInt)(a->w[5]) * (b->w[1]);
642 acclo += tmp & BIGNUM_INT_MASK;
644 tmp = (BignumDblInt)(a->w[6]) * (b->w[0]);
645 acclo += tmp & BIGNUM_INT_MASK;
648 acclo = acchi + (acclo >> 16);
650 tmp = (BignumDblInt)(a->w[0]) * (b->w[7]);
651 acclo += tmp & BIGNUM_INT_MASK;
653 tmp = (BignumDblInt)(a->w[1]) * (b->w[6]);
654 acclo += tmp & BIGNUM_INT_MASK;
656 tmp = (BignumDblInt)(a->w[2]) * (b->w[5]);
657 acclo += tmp & BIGNUM_INT_MASK;
659 tmp = (BignumDblInt)(a->w[3]) * (b->w[4]);
660 acclo += tmp & BIGNUM_INT_MASK;
662 tmp = (BignumDblInt)(a->w[4]) * (b->w[3]);
663 acclo += tmp & BIGNUM_INT_MASK;
665 tmp = (BignumDblInt)(a->w[5]) * (b->w[2]);
666 acclo += tmp & BIGNUM_INT_MASK;
668 tmp = (BignumDblInt)(a->w[6]) * (b->w[1]);
669 acclo += tmp & BIGNUM_INT_MASK;
671 tmp = (BignumDblInt)(a->w[7]) * (b->w[0]);
672 acclo += tmp & BIGNUM_INT_MASK;
675 acclo = acchi + (acclo >> 16);
677 tmp = (BignumDblInt)(a->w[0]) * (b->w[8]);
678 acclo += tmp & BIGNUM_INT_MASK;
680 tmp = (BignumDblInt)(a->w[1]) * (b->w[7]);
681 acclo += tmp & BIGNUM_INT_MASK;
683 tmp = (BignumDblInt)(a->w[2]) * (b->w[6]);
684 acclo += tmp & BIGNUM_INT_MASK;
686 tmp = (BignumDblInt)(a->w[3]) * (b->w[5]);
687 acclo += tmp & BIGNUM_INT_MASK;
689 tmp = (BignumDblInt)(a->w[4]) * (b->w[4]);
690 acclo += tmp & BIGNUM_INT_MASK;
692 tmp = (BignumDblInt)(a->w[5]) * (b->w[3]);
693 acclo += tmp & BIGNUM_INT_MASK;
695 tmp = (BignumDblInt)(a->w[6]) * (b->w[2]);
696 acclo += tmp & BIGNUM_INT_MASK;
698 tmp = (BignumDblInt)(a->w[7]) * (b->w[1]);
699 acclo += tmp & BIGNUM_INT_MASK;
701 tmp = (BignumDblInt)(a->w[8]) * (b->w[0]);
702 acclo += tmp & BIGNUM_INT_MASK;
704 r->w[8] = acclo & (((BignumInt)1 << 2)-1);
706 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0);
707 acclo = acchi + (acclo >> 16);
709 tmp = (BignumDblInt)(a->w[1]) * (b->w[8]);
710 acclo += tmp & BIGNUM_INT_MASK;
712 tmp = (BignumDblInt)(a->w[2]) * (b->w[7]);
713 acclo += tmp & BIGNUM_INT_MASK;
715 tmp = (BignumDblInt)(a->w[3]) * (b->w[6]);
716 acclo += tmp & BIGNUM_INT_MASK;
718 tmp = (BignumDblInt)(a->w[4]) * (b->w[5]);
719 acclo += tmp & BIGNUM_INT_MASK;
721 tmp = (BignumDblInt)(a->w[5]) * (b->w[4]);
722 acclo += tmp & BIGNUM_INT_MASK;
724 tmp = (BignumDblInt)(a->w[6]) * (b->w[3]);
725 acclo += tmp & BIGNUM_INT_MASK;
727 tmp = (BignumDblInt)(a->w[7]) * (b->w[2]);
728 acclo += tmp & BIGNUM_INT_MASK;
730 tmp = (BignumDblInt)(a->w[8]) * (b->w[1]);
731 acclo += tmp & BIGNUM_INT_MASK;
733 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14);
737 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0);
738 acclo = acchi + (acclo >> 16);
740 tmp = (BignumDblInt)(a->w[2]) * (b->w[8]);
741 acclo += tmp & BIGNUM_INT_MASK;
743 tmp = (BignumDblInt)(a->w[3]) * (b->w[7]);
744 acclo += tmp & BIGNUM_INT_MASK;
746 tmp = (BignumDblInt)(a->w[4]) * (b->w[6]);
747 acclo += tmp & BIGNUM_INT_MASK;
749 tmp = (BignumDblInt)(a->w[5]) * (b->w[5]);
750 acclo += tmp & BIGNUM_INT_MASK;
752 tmp = (BignumDblInt)(a->w[6]) * (b->w[4]);
753 acclo += tmp & BIGNUM_INT_MASK;
755 tmp = (BignumDblInt)(a->w[7]) * (b->w[3]);
756 acclo += tmp & BIGNUM_INT_MASK;
758 tmp = (BignumDblInt)(a->w[8]) * (b->w[2]);
759 acclo += tmp & BIGNUM_INT_MASK;
761 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14);
765 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0);
766 acclo = acchi + (acclo >> 16);
768 tmp = (BignumDblInt)(a->w[3]) * (b->w[8]);
769 acclo += tmp & BIGNUM_INT_MASK;
771 tmp = (BignumDblInt)(a->w[4]) * (b->w[7]);
772 acclo += tmp & BIGNUM_INT_MASK;
774 tmp = (BignumDblInt)(a->w[5]) * (b->w[6]);
775 acclo += tmp & BIGNUM_INT_MASK;
777 tmp = (BignumDblInt)(a->w[6]) * (b->w[5]);
778 acclo += tmp & BIGNUM_INT_MASK;
780 tmp = (BignumDblInt)(a->w[7]) * (b->w[4]);
781 acclo += tmp & BIGNUM_INT_MASK;
783 tmp = (BignumDblInt)(a->w[8]) * (b->w[3]);
784 acclo += tmp & BIGNUM_INT_MASK;
786 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14);
790 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0);
791 acclo = acchi + (acclo >> 16);
793 tmp = (BignumDblInt)(a->w[4]) * (b->w[8]);
794 acclo += tmp & BIGNUM_INT_MASK;
796 tmp = (BignumDblInt)(a->w[5]) * (b->w[7]);
797 acclo += tmp & BIGNUM_INT_MASK;
799 tmp = (BignumDblInt)(a->w[6]) * (b->w[6]);
800 acclo += tmp & BIGNUM_INT_MASK;
802 tmp = (BignumDblInt)(a->w[7]) * (b->w[5]);
803 acclo += tmp & BIGNUM_INT_MASK;
805 tmp = (BignumDblInt)(a->w[8]) * (b->w[4]);
806 acclo += tmp & BIGNUM_INT_MASK;
808 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14);
812 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0);
813 acclo = acchi + (acclo >> 16);
815 tmp = (BignumDblInt)(a->w[5]) * (b->w[8]);
816 acclo += tmp & BIGNUM_INT_MASK;
818 tmp = (BignumDblInt)(a->w[6]) * (b->w[7]);
819 acclo += tmp & BIGNUM_INT_MASK;
821 tmp = (BignumDblInt)(a->w[7]) * (b->w[6]);
822 acclo += tmp & BIGNUM_INT_MASK;
824 tmp = (BignumDblInt)(a->w[8]) * (b->w[5]);
825 acclo += tmp & BIGNUM_INT_MASK;
827 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14);
831 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0);
832 acclo = acchi + (acclo >> 16);
834 tmp = (BignumDblInt)(a->w[6]) * (b->w[8]);
835 acclo += tmp & BIGNUM_INT_MASK;
837 tmp = (BignumDblInt)(a->w[7]) * (b->w[7]);
838 acclo += tmp & BIGNUM_INT_MASK;
840 tmp = (BignumDblInt)(a->w[8]) * (b->w[6]);
841 acclo += tmp & BIGNUM_INT_MASK;
843 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14);
847 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0);
848 acclo = acchi + (acclo >> 16);
850 tmp = (BignumDblInt)(a->w[7]) * (b->w[8]);
851 acclo += tmp & BIGNUM_INT_MASK;
853 tmp = (BignumDblInt)(a->w[8]) * (b->w[7]);
854 acclo += tmp & BIGNUM_INT_MASK;
856 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14);
860 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 14)-1)) * ((BignumDblInt)5 << 0);
861 acclo = acchi + (acclo >> 16);
863 tmp = (BignumDblInt)(a->w[8]) * (b->w[8]);
864 acclo += tmp & BIGNUM_INT_MASK;
866 acc2lo += (acclo & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 14);
870 acc2lo += ((acclo >> 2) & (((BignumInt)1 << 2)-1)) * ((BignumDblInt)5 << 0);
874 acc2lo += ((acclo >> 4) & (((BignumInt)1 << 12)-1)) * ((BignumDblInt)25 << 0);
875 acclo = acchi + (acclo >> 16);
877 acc2lo += (acclo & (((BignumInt)1 << 4)-1)) * ((BignumDblInt)25 << 12);
881 acc2lo += ((acclo >> 4) & (((BignumInt)1 << 12)-1)) * ((BignumDblInt)25 << 0);
882 acclo = acchi + (acclo >> 16);
909 #error Run contrib/make1305.py again with a different bit count
913 static void bigval_final_reduce(bigval *n)
915 #if BIGNUM_INT_BITS == 64
916 /* ./contrib/make1305.py final_reduce 64 */
919 acclo += 5 * ((n->w[2] >> 2) + 1);
925 acclo = 5 * (acclo >> 2);
935 n->w[2] &= (1 << 2) - 1;
936 #elif BIGNUM_INT_BITS == 32
937 /* ./contrib/make1305.py final_reduce 32 */
940 acclo += 5 * ((n->w[4] >> 2) + 1);
950 acclo = 5 * (acclo >> 2);
966 n->w[4] &= (1 << 2) - 1;
967 #elif BIGNUM_INT_BITS == 16
968 /* ./contrib/make1305.py final_reduce 16 */
971 acclo += 5 * ((n->w[8] >> 2) + 1);
989 acclo = 5 * (acclo >> 2);
1017 n->w[8] &= (1 << 2) - 1;
1019 #error Run contrib/make1305.py again with a different bit count
1024 unsigned char nonce[16];
1028 /* Buffer in case we get less than a multiple of 16 bytes */
1029 unsigned char buffer[16];
1033 static void poly1305_init(struct poly1305 *ctx)
1035 memset(ctx->nonce, 0, 16);
1036 ctx->bufferIndex = 0;
1037 bigval_clear(&ctx->h);
1040 /* Takes a 256 bit key */
1041 static void poly1305_key(struct poly1305 *ctx, const unsigned char *key)
1043 unsigned char key_copy[16];
1044 memcpy(key_copy, key, 16);
1046 /* Key the MAC itself
1047 * bytes 4, 8, 12 and 16 are required to have their top four bits clear */
1048 key_copy[3] &= 0x0f;
1049 key_copy[7] &= 0x0f;
1050 key_copy[11] &= 0x0f;
1051 key_copy[15] &= 0x0f;
1052 /* bytes 5, 9 and 13 are required to have their bottom two bits clear */
1053 key_copy[4] &= 0xfc;
1054 key_copy[8] &= 0xfc;
1055 key_copy[12] &= 0xfc;
1056 bigval_import_le(&ctx->r, key_copy, 16);
1057 smemclr(key_copy, sizeof(key_copy));
1059 /* Use the second 128 bits as the nonce */
1060 memcpy(ctx->nonce, key+16, 16);
1063 /* Feed up to 16 bytes (should only be less for the last chunk) */
1064 static void poly1305_feed_chunk(struct poly1305 *ctx,
1065 const unsigned char *chunk, int len)
1068 bigval_import_le(&c, chunk, len);
1069 c.w[len / BIGNUM_INT_BYTES] |=
1070 (BignumInt)1 << (8 * (len % BIGNUM_INT_BYTES));
1071 bigval_add(&c, &c, &ctx->h);
1072 bigval_mul_mod_p(&ctx->h, &c, &ctx->r);
1075 static void poly1305_feed(struct poly1305 *ctx,
1076 const unsigned char *buf, int len)
1078 /* Check for stuff left in the buffer from last time */
1079 if (ctx->bufferIndex) {
1080 /* Try to fill up to 16 */
1081 while (ctx->bufferIndex < 16 && len) {
1082 ctx->buffer[ctx->bufferIndex++] = *buf++;
1085 if (ctx->bufferIndex == 16) {
1086 poly1305_feed_chunk(ctx, ctx->buffer, 16);
1087 ctx->bufferIndex = 0;
1091 /* Process 16 byte whole chunks */
1093 poly1305_feed_chunk(ctx, buf, 16);
1098 /* Cache stuff that's left over */
1100 memcpy(ctx->buffer, buf, len);
1101 ctx->bufferIndex = len;
1105 /* Finalise and populate the buffer with the 16-byte MAC */
1106 static void poly1305_finalise(struct poly1305 *ctx, unsigned char *mac)
1110 if (ctx->bufferIndex) {
1111 poly1305_feed_chunk(ctx, ctx->buffer, ctx->bufferIndex);
1114 bigval_import_le(&tmp, ctx->nonce, 16);
1115 bigval_final_reduce(&ctx->h);
1116 bigval_add(&tmp, &tmp, &ctx->h);
1117 bigval_export_le(&tmp, mac, 16);
1122 struct ccp_context {
1123 struct chacha20 a_cipher; /* Used for length */
1124 struct chacha20 b_cipher; /* Used for content */
1126 /* Cache of the first 4 bytes because they are the sequence number */
1127 /* Kept in 8 bytes with the top as zero to allow easy passing to setiv */
1128 int mac_initialised; /* Where we have got to in filling mac_iv */
1129 unsigned char mac_iv[8];
1131 struct poly1305 mac;
1134 static void *poly_make_context(void *ctx)
1139 static void poly_free_context(void *ctx)
1141 /* Not allocated, just forwarded, no need to free */
1144 static void poly_setkey(void *ctx, unsigned char *key)
1146 /* Uses the same context as ChaCha20, so ignore */
1149 static void poly_start(void *handle)
1151 struct ccp_context *ctx = (struct ccp_context *)handle;
1153 ctx->mac_initialised = 0;
1154 memset(ctx->mac_iv, 0, 8);
1155 poly1305_init(&ctx->mac);
1158 static void poly_bytes(void *handle, unsigned char const *blk, int len)
1160 struct ccp_context *ctx = (struct ccp_context *)handle;
1162 /* First 4 bytes are the IV */
1163 while (ctx->mac_initialised < 4 && len) {
1164 ctx->mac_iv[7 - ctx->mac_initialised] = *blk++;
1165 ++ctx->mac_initialised;
1169 /* Initialise the IV if needed */
1170 if (ctx->mac_initialised == 4) {
1171 chacha20_iv(&ctx->b_cipher, ctx->mac_iv);
1172 ++ctx->mac_initialised; /* Don't do it again */
1174 /* Do first rotation */
1175 chacha20_round(&ctx->b_cipher);
1177 /* Set the poly key */
1178 poly1305_key(&ctx->mac, ctx->b_cipher.current);
1180 /* Set the first round as used */
1181 ctx->b_cipher.currentIndex = 64;
1184 /* Update the MAC with anything left */
1186 poly1305_feed(&ctx->mac, blk, len);
1190 static void poly_genresult(void *handle, unsigned char *blk)
1192 struct ccp_context *ctx = (struct ccp_context *)handle;
1193 poly1305_finalise(&ctx->mac, blk);
1196 static int poly_verresult(void *handle, unsigned char const *blk)
1198 struct ccp_context *ctx = (struct ccp_context *)handle;
1200 unsigned char mac[16];
1201 poly1305_finalise(&ctx->mac, mac);
1202 res = smemeq(blk, mac, 16);
1206 /* The generic poly operation used before generate and verify */
1207 static void poly_op(void *handle, unsigned char *blk, int len, unsigned long seq)
1209 unsigned char iv[4];
1211 PUT_32BIT_MSB_FIRST(iv, seq);
1212 /* poly_bytes expects the first 4 bytes to be the IV */
1213 poly_bytes(handle, iv, 4);
1214 smemclr(iv, sizeof(iv));
1215 poly_bytes(handle, blk, len);
1218 static void poly_generate(void *handle, unsigned char *blk, int len, unsigned long seq)
1220 poly_op(handle, blk, len, seq);
1221 poly_genresult(handle, blk+len);
1224 static int poly_verify(void *handle, unsigned char *blk, int len, unsigned long seq)
1226 poly_op(handle, blk, len, seq);
1227 return poly_verresult(handle, blk+len);
1230 static const struct ssh_mac ssh2_poly1305 = {
1231 poly_make_context, poly_free_context,
1234 /* whole-packet operations */
1235 poly_generate, poly_verify,
1237 /* partial-packet operations */
1238 poly_start, poly_bytes, poly_genresult, poly_verresult,
1240 "", "", /* Not selectable individually, just part of ChaCha20-Poly1305 */
1244 static void *ccp_make_context(void)
1246 struct ccp_context *ctx = snew(struct ccp_context);
1248 poly1305_init(&ctx->mac);
1253 static void ccp_free_context(void *vctx)
1255 struct ccp_context *ctx = (struct ccp_context *)vctx;
1256 smemclr(&ctx->a_cipher, sizeof(ctx->a_cipher));
1257 smemclr(&ctx->b_cipher, sizeof(ctx->b_cipher));
1258 smemclr(&ctx->mac, sizeof(ctx->mac));
1262 static void ccp_iv(void *vctx, unsigned char *iv)
1264 /* struct ccp_context *ctx = (struct ccp_context *)vctx; */
1265 /* IV is set based on the sequence number */
1268 static void ccp_key(void *vctx, unsigned char *key)
1270 struct ccp_context *ctx = (struct ccp_context *)vctx;
1271 /* Initialise the a_cipher (for encrypting/decrypting lengths) with the second 256 bits (key + 32) */
1272 chacha20_key(&ctx->a_cipher, key + 32);
1273 /* Initialise the b_cipher (for content and MAC) with the first 256 bits (key + 0) */
1274 chacha20_key(&ctx->b_cipher, key);
1277 static void ccp_encrypt(void *vctx, unsigned char *blk, int len)
1279 struct ccp_context *ctx = (struct ccp_context *)vctx;
1280 chacha20_encrypt(&ctx->b_cipher, blk, len);
1283 static void ccp_decrypt(void *vctx, unsigned char *blk, int len)
1285 struct ccp_context *ctx = (struct ccp_context *)vctx;
1286 chacha20_decrypt(&ctx->b_cipher, blk, len);
1289 static void ccp_length_op(struct ccp_context *ctx, unsigned char *blk, int len,
1292 unsigned char iv[8];
1294 * According to RFC 4253 (section 6.4), the packet sequence number wraps
1295 * at 2^32, so its 32 high-order bits will always be zero.
1297 PUT_32BIT_LSB_FIRST(iv, 0);
1298 PUT_32BIT_LSB_FIRST(iv + 4, seq);
1299 chacha20_iv(&ctx->a_cipher, iv);
1300 chacha20_iv(&ctx->b_cipher, iv);
1301 /* Reset content block count to 1, as the first is the key for Poly1305 */
1302 ++ctx->b_cipher.state[12];
1303 smemclr(iv, sizeof(iv));
1306 static void ccp_encrypt_length(void *vctx, unsigned char *blk, int len,
1309 struct ccp_context *ctx = (struct ccp_context *)vctx;
1310 ccp_length_op(ctx, blk, len, seq);
1311 chacha20_encrypt(&ctx->a_cipher, blk, len);
1314 static void ccp_decrypt_length(void *vctx, unsigned char *blk, int len,
1317 struct ccp_context *ctx = (struct ccp_context *)vctx;
1318 ccp_length_op(ctx, blk, len, seq);
1319 chacha20_decrypt(&ctx->a_cipher, blk, len);
1322 static const struct ssh2_cipher ssh2_chacha20_poly1305 = {
1333 "chacha20-poly1305@openssh.com",
1334 1, 512, SSH_CIPHER_SEPARATE_LENGTH, "ChaCha20",
1339 static const struct ssh2_cipher *const ccp_list[] = {
1340 &ssh2_chacha20_poly1305
1343 const struct ssh2_ciphers ssh2_ccp = {
1344 sizeof(ccp_list) / sizeof(*ccp_list),