/*
 * Zlib (RFC1950 / RFC1951) compression for PuTTY.
 * 
 * There will no doubt be criticism of my decision to reimplement
 * Zlib compression from scratch instead of using the existing zlib
 * code. People will cry `reinventing the wheel'; they'll claim
 * that the `fundamental basis of OSS' is code reuse; they'll want
 * to see a really good reason for me having chosen not to use the
 * existing code.
 * 
 * Well, here are my reasons. Firstly, I don't want to link the
 * whole of zlib into the PuTTY binary; PuTTY is justifiably proud
 * of its small size and I think zlib contains a lot of unnecessary
 * baggage for the kind of compression that SSH requires.
 * 
 * Secondly, I also don't like the alternative of using zlib.dll.
 * Another thing PuTTY is justifiably proud of is its ease of
 * installation, and the last thing I want to do is to start
 * mandating DLLs. Not only that, but there are two _kinds_ of
 * zlib.dll kicking around, one with C calling conventions on the
 * exported functions and another with WINAPI conventions, and
 * there would be a significant danger of getting the wrong one.
 * 
 * Thirdly, there seems to be a difference of opinion on the IETF
 * secsh mailing list about the correct way to round off a
 * compressed packet and start the next. In particular, there's
 * some talk of switching to a mechanism zlib isn't currently
 * capable of supporting (see below for an explanation). Given that
 * sort of uncertainty, I thought it might be better to have code
 * that will support even the zlib-incompatible worst case.
 * 
 * Fourthly, it's a _second implementation_. Second implementations
 * are fundamentally a Good Thing in standardisation efforts. The
 * difference of opinion mentioned above has arisen _precisely_
 * because there has been only one zlib implementation and
 * everybody has used it. I don't intend that this should happen
 * again.
 */

#include <stdlib.h>
#include <assert.h>

/* FIXME */
#include <windows.h>
#include <stdio.h>
#include "putty.h"

#include "ssh.h"

/* ----------------------------------------------------------------------
 * Basic LZ77 code. This bit is designed modularly, so it could be
 * ripped out and used in a different LZ77 compressor. Go to it,
 * and good luck :-)
 */

struct LZ77InternalContext;
struct LZ77Context {
    struct LZ77InternalContext *ictx;
    void *userdata;
    void (*literal)(struct LZ77Context *ctx, unsigned char c);
    void (*match)(struct LZ77Context *ctx, int distance, int len);
};

/*
 * Initialise the private fields of an LZ77Context. It's up to the
 * user to initialise the public fields.
 */
static int lz77_init(struct LZ77Context *ctx);

/*
 * Supply data to be compressed. Will update the private fields of
 * the LZ77Context, and will call literal() and match() to output.
 * If `compress' is FALSE, it will never emit a match, but will
 * instead call literal() for everything.
 */
static void lz77_compress(struct LZ77Context *ctx,
                          unsigned char *data, int len, int compress);

/*
 * Modifiable parameters.
 */
#define WINSIZE 32768		       /* window size. Must be power of 2! */
#define HASHMAX 2039		       /* one more than max hash value */
#define MAXMATCH 32		       /* how many matches we track */
#define HASHCHARS 3		       /* how many chars make a hash */

/*
 * This compressor takes a less slapdash approach than the
 * gzip/zlib one. Rather than allowing our hash chains to fall into
 * disuse near the far end, we keep them doubly linked so we can
 * _find_ the far end, and then every time we add a new byte to the
 * window (thus rolling round by one and removing the previous
 * byte), we can carefully remove the hash chain entry.
 */

#define INVALID -1		       /* invalid hash _and_ invalid offset */
struct WindowEntry {
    short next, prev;		       /* array indices within the window */
    short hashval;
};

struct HashEntry {
    short first;		       /* window index of first in chain */
};

struct Match {
    int distance, len;
};

struct LZ77InternalContext {
    struct WindowEntry win[WINSIZE];
    unsigned char data[WINSIZE];
    int winpos;
    struct HashEntry hashtab[HASHMAX];
    unsigned char pending[HASHCHARS];
    int npending;
};

static int lz77_hash(unsigned char *data) {
    return (257*data[0] + 263*data[1] + 269*data[2]) % HASHMAX;
}

static int lz77_init(struct LZ77Context *ctx) {
    struct LZ77InternalContext *st;
    int i;

    st = (struct LZ77InternalContext *)smalloc(sizeof(*st));
    if (!st)
	return 0;

    ctx->ictx = st;

    for (i = 0; i < WINSIZE; i++)
	st->win[i].next = st->win[i].prev = st->win[i].hashval = INVALID;
    for (i = 0; i < HASHMAX; i++)
	st->hashtab[i].first = INVALID;
    st->winpos = 0;

    st->npending = 0;

    return 1;
}

static void lz77_advance(struct LZ77InternalContext *st,
			 unsigned char c, int hash) {
    int off;

    /*
     * Remove the hash entry at winpos from the tail of its chain,
     * or empty the chain if it's the only thing on the chain.
     */
    if (st->win[st->winpos].prev != INVALID) {
	st->win[st->win[st->winpos].prev].next = INVALID;
    } else if (st->win[st->winpos].hashval != INVALID) {
	st->hashtab[st->win[st->winpos].hashval].first = INVALID;
    }

    /*
     * Create a new entry at winpos and add it to the head of its
     * hash chain.
     */
    st->win[st->winpos].hashval = hash;
    st->win[st->winpos].prev = INVALID;
    off = st->win[st->winpos].next = st->hashtab[hash].first;
    st->hashtab[hash].first = st->winpos;
    if (off != INVALID)
	st->win[off].prev = st->winpos;
    st->data[st->winpos] = c;

    /*
     * Advance the window pointer.
     */
    st->winpos = (st->winpos + 1) & (WINSIZE-1);
}

#define CHARAT(k) ( (k)<0 ? st->data[(st->winpos+k)&(WINSIZE-1)] : data[k] )

static void lz77_compress(struct LZ77Context *ctx,
                          unsigned char *data, int len, int compress) {
    struct LZ77InternalContext *st = ctx->ictx;
    int i, hash, distance, off, nmatch, matchlen, advance;
    struct Match defermatch, matches[MAXMATCH];
    int deferchr;

    /*
     * Add any pending characters from last time to the window. (We
     * might not be able to.)
     */
    for (i = 0; i < st->npending; i++) {
	unsigned char foo[HASHCHARS];
	int j;
	if (len + st->npending - i < HASHCHARS) {
	    /* Update the pending array. */
	    for (j = i; j < st->npending; j++)
		st->pending[j-i] = st->pending[j];
	    break;
	}
	for (j = 0; j < HASHCHARS; j++)
	    foo[j] = (i + j < st->npending ? st->pending[i+j] :
		      data[i + j - st->npending]);
	lz77_advance(st, foo[0], lz77_hash(foo));
    }
    st->npending -= i;

    defermatch.len = 0;
    while (len > 0) {

        /* Don't even look for a match, if we're not compressing. */
        if (compress && len >= HASHCHARS) {
            /*
             * Hash the next few characters.
             */
            hash = lz77_hash(data);

            /*
             * Look the hash up in the corresponding hash chain and see
             * what we can find.
             */
            nmatch = 0;
            for (off = st->hashtab[hash].first;
                 off != INVALID; off = st->win[off].next) {
                /* distance = 1       if off == st->winpos-1 */
                /* distance = WINSIZE if off == st->winpos   */
                distance = WINSIZE - (off + WINSIZE - st->winpos) % WINSIZE;
                for (i = 0; i < HASHCHARS; i++)
                    if (CHARAT(i) != CHARAT(i-distance))
                        break;
                if (i == HASHCHARS) {
                    matches[nmatch].distance = distance;
                    matches[nmatch].len = 3;
                    if (++nmatch >= MAXMATCH)
                        break;
                }
            }
        } else {
            nmatch = 0;
	    hash = INVALID;
	}

	if (nmatch > 0) {
	    /*
	     * We've now filled up matches[] with nmatch potential
	     * matches. Follow them down to find the longest. (We
	     * assume here that it's always worth favouring a
	     * longer match over a shorter one.)
	     */
	    matchlen = HASHCHARS;
	    while (matchlen < len) {
		int j;
		for (i = j = 0; i < nmatch; i++) {
		    if (CHARAT(matchlen) ==
			CHARAT(matchlen - matches[i].distance)) {
			matches[j++] = matches[i];
		    }
		}
		if (j == 0)
		    break;
		matchlen++;
		nmatch = j;
	    }

	    /*
	     * We've now got all the longest matches. We favour the
	     * shorter distances, which means we go with matches[0].
	     * So see if we want to defer it or throw it away.
	     */
	    matches[0].len = matchlen;
	    if (defermatch.len > 0) {
		if (matches[0].len > defermatch.len + 1) {
		    /* We have a better match. Emit the deferred char,
		     * and defer this match. */
		    ctx->literal(ctx, (unsigned char)deferchr);
		    defermatch = matches[0];
		    deferchr = data[0];
		    advance = 1;
		} else {
		    /* We don't have a better match. Do the deferred one. */
		    ctx->match(ctx, defermatch.distance, defermatch.len);
		    advance = defermatch.len - 1;
		    defermatch.len = 0;
		}
	    } else {
		/* There was no deferred match. Defer this one. */
		defermatch = matches[0];
		deferchr = data[0];
		advance = 1;
	    }	    
	} else {
	    /*
	     * We found no matches. Emit the deferred match, if
	     * any; otherwise emit a literal.
	     */
	    if (defermatch.len > 0) {
		ctx->match(ctx, defermatch.distance, defermatch.len);
		advance = defermatch.len - 1;
		defermatch.len = 0;
	    } else {
		ctx->literal(ctx, data[0]);
		advance = 1;
	    }
	}

	/*
	 * Now advance the position by `advance' characters,
	 * keeping the window and hash chains consistent.
	 */
	while (advance > 0) {
	    if (len >= HASHCHARS) {
		lz77_advance(st, *data, lz77_hash(data));
	    } else {
		st->pending[st->npending++] = *data;
	    }
	    data++;
	    len--;
	    advance--;
	}
    }
}

/* ----------------------------------------------------------------------
 * Zlib compression. We always use the static Huffman tree option.
 * Mostly this is because it's hard to scan a block in advance to
 * work out better trees; dynamic trees are great when you're
 * compressing a large file under no significant time constraint,
 * but when you're compressing little bits in real time, things get
 * hairier.
 * 
 * I suppose it's possible that I could compute Huffman trees based
 * on the frequencies in the _previous_ block, as a sort of
 * heuristic, but I'm not confident that the gain would balance out
 * having to transmit the trees.
 */

static struct LZ77Context ectx;

struct Outbuf {
    unsigned char *outbuf;
    int outlen, outsize;
    unsigned long outbits;
    int noutbits;
    int firstblock;
    int comp_disabled;
};

static void outbits(struct Outbuf *out, unsigned long bits, int nbits) {
    assert(out->noutbits + nbits <= 32);
    out->outbits |= bits << out->noutbits;
    out->noutbits += nbits;
    while (out->noutbits >= 8) {
        if (out->outlen >= out->outsize) {
            out->outsize = out->outlen + 64;
            out->outbuf = srealloc(out->outbuf, out->outsize);
        }
        out->outbuf[out->outlen++] = (unsigned char)(out->outbits & 0xFF);
        out->outbits >>= 8;
        out->noutbits -= 8;
    }
}

static const unsigned char mirrorbytes[256] = {
    0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
    0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
    0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
    0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
    0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
    0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
    0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
    0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
    0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
    0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
    0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
    0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
    0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
    0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
    0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
    0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
    0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
    0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
    0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
    0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
    0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
    0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
    0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
    0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
    0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
    0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
    0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
    0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
    0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
    0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
    0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
    0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
};

typedef struct {
    short code, extrabits;
    int min, max;
} coderecord;

static const coderecord lencodes[] = {
    {257, 0, 3,3},
    {258, 0, 4,4},
    {259, 0, 5,5},
    {260, 0, 6,6},
    {261, 0, 7,7},
    {262, 0, 8,8},
    {263, 0, 9,9},
    {264, 0, 10,10},
    {265, 1, 11,12},
    {266, 1, 13,14},
    {267, 1, 15,16},
    {268, 1, 17,18},
    {269, 2, 19,22},
    {270, 2, 23,26},
    {271, 2, 27,30},
    {272, 2, 31,34},
    {273, 3, 35,42},
    {274, 3, 43,50},
    {275, 3, 51,58},
    {276, 3, 59,66},
    {277, 4, 67,82},
    {278, 4, 83,98},
    {279, 4, 99,114},
    {280, 4, 115,130},
    {281, 5, 131,162},
    {282, 5, 163,194},
    {283, 5, 195,226},
    {284, 5, 227,257},
    {285, 0, 258,258},
};

static const coderecord distcodes[] = {
    {0, 0, 1,1},
    {1, 0, 2,2},
    {2, 0, 3,3},
    {3, 0, 4,4},
    {4, 1, 5,6},
    {5, 1, 7,8},
    {6, 2, 9,12},
    {7, 2, 13,16},
    {8, 3, 17,24},
    {9, 3, 25,32},
    {10, 4, 33,48},
    {11, 4, 49,64},
    {12, 5, 65,96},
    {13, 5, 97,128},
    {14, 6, 129,192},
    {15, 6, 193,256},
    {16, 7, 257,384},
    {17, 7, 385,512},
    {18, 8, 513,768},
    {19, 8, 769,1024},
    {20, 9, 1025,1536},
    {21, 9, 1537,2048},
    {22, 10, 2049,3072},
    {23, 10, 3073,4096},
    {24, 11, 4097,6144},
    {25, 11, 6145,8192},
    {26, 12, 8193,12288},
    {27, 12, 12289,16384},
    {28, 13, 16385,24576},
    {29, 13, 24577,32768},
};

static void zlib_literal(struct LZ77Context *ectx, unsigned char c) {
    struct Outbuf *out = (struct Outbuf *)ectx->userdata;

    if (out->comp_disabled) {
        /*
         * We're in an uncompressed block, so just output the byte.
         */
        outbits(out, c, 8);
        return;
    }

    if (c <= 143) {
        /* 0 through 143 are 8 bits long starting at 00110000. */
        outbits(out, mirrorbytes[0x30 + c], 8);
    } else {
        /* 144 through 255 are 9 bits long starting at 110010000. */
        outbits(out, 1 + 2*mirrorbytes[0x90 - 144 + c], 9);
    }
}

static void zlib_match(struct LZ77Context *ectx, int distance, int len) {
    const coderecord *d, *l;
    int i, j, k;
    struct Outbuf *out = (struct Outbuf *)ectx->userdata;

    assert(!out->comp_disabled);

    while (len > 0) {
        int thislen;
	
	/*
	 * We can transmit matches of lengths 3 through 258
	 * inclusive. So if len exceeds 258, we must transmit in
	 * several steps, with 258 or less in each step.
	 * 
	 * Specifically: if len >= 261, we can transmit 258 and be
	 * sure of having at least 3 left for the next step. And if
	 * len <= 258, we can just transmit len. But if len == 259
	 * or 260, we must transmit len-3.
	 */
	thislen = (len > 260 ? 258 : len <= 258 ? len : len-3);
        len -= thislen;

        /*
         * Binary-search to find which length code we're
         * transmitting.
         */
        i = -1; j = sizeof(lencodes)/sizeof(*lencodes);
        while (j - i >= 2) {
            k = (j+i)/2;
            if (thislen < lencodes[k].min)
                j = k;
            else if (thislen > lencodes[k].max)
                i = k;
            else {
                l = &lencodes[k];
                break;                 /* found it! */
            }
        }

        /*
         * Transmit the length code. 256-279 are seven bits
         * starting at 0000000; 280-287 are eight bits starting at
         * 11000000.
         */
        if (l->code <= 279) {
            outbits(out, mirrorbytes[(l->code-256)*2], 7);
        } else {
            outbits(out, mirrorbytes[0xc0 - 280 + l->code], 8);
        }

        /*
         * Transmit the extra bits.
         */
        if (l->extrabits)
            outbits(out, thislen - l->min, l->extrabits);

        /*
         * Binary-search to find which distance code we're
         * transmitting.
         */
        i = -1; j = sizeof(distcodes)/sizeof(*distcodes);
        while (j - i >= 2) {
            k = (j+i)/2;
            if (distance < distcodes[k].min)
                j = k;
            else if (distance > distcodes[k].max)
                i = k;
            else {
                d = &distcodes[k];
                break;                 /* found it! */
            }
        }

        /*
         * Transmit the distance code. Five bits starting at 00000.
         */
        outbits(out, mirrorbytes[d->code*8], 5);

        /*
         * Transmit the extra bits.
         */
        if (d->extrabits)
            outbits(out, distance - d->min, d->extrabits);
    }
}

void zlib_compress_init(void) {
    struct Outbuf *out;

    lz77_init(&ectx);
    ectx.literal = zlib_literal;
    ectx.match = zlib_match;

    out = smalloc(sizeof(struct Outbuf));
    out->outbits = out->noutbits = 0;
    out->firstblock = 1;
    out->comp_disabled = FALSE;
    ectx.userdata = out;

    logevent("Initialised zlib (RFC1950) compression");
}

/*
 * Turn off actual LZ77 analysis for one block, to facilitate
 * construction of a precise-length IGNORE packet. Returns the
 * length adjustment (which is only valid for packets < 65536
 * bytes, but that seems reasonable enough).
 */
int zlib_disable_compression(void) {
    struct Outbuf *out = (struct Outbuf *)ectx.userdata;
    int n;

    out->comp_disabled = TRUE;

    n = 0;
    /*
     * If this is the first block, we will start by outputting two
     * header bytes, and then three bits to begin an uncompressed
     * block. This will cost three bytes (because we will start on
     * a byte boundary, this is certain).
     */
    if (out->firstblock) {
        n = 3;
    } else {
        /*
         * Otherwise, we will output seven bits to close the
         * previous static block, and _then_ three bits to begin an
         * uncompressed block, and then flush the current byte.
         * This may cost two bytes or three, depending on noutbits.
         */
        n += (out->noutbits + 10) / 8;
    }

    /*
     * Now we output four bytes for the length / ~length pair in
     * the uncompressed block.
     */
    n += 4;

    return n;
}

int zlib_compress_block(unsigned char *block, int len,
			unsigned char **outblock, int *outlen) {
    struct Outbuf *out = (struct Outbuf *)ectx.userdata;
    int in_block;

    out->outbuf = NULL;
    out->outlen = out->outsize = 0;

    /*
     * If this is the first block, output the Zlib (RFC1950) header
     * bytes 78 9C. (Deflate compression, 32K window size, default
     * algorithm.)
     */
    if (out->firstblock) {
        outbits(out, 0x9C78, 16);
        out->firstblock = 0;

        in_block = FALSE;
    }

    if (out->comp_disabled) {
        if (in_block)
            outbits(out, 0, 7);		       /* close static block */

        while (len > 0) {
            int blen = (len < 65535 ? len : 65535);

            /*
             * Start a Deflate (RFC1951) uncompressed block. We
             * transmit a zero bit (BFINAL=0), followed by a zero
             * bit and a one bit (BTYPE=00). Of course these are in
             * the wrong order (00 0).
             */
            outbits(out, 0, 3);

            /*
             * Output zero bits to align to a byte boundary.
             */
            if (out->noutbits)
                outbits(out, 0, 8 - out->noutbits);

            /*
             * Output the block length, and then its one's
             * complement. They're little-endian, so all we need to
             * do is pass them straight to outbits() with bit count
             * 16.
             */
            outbits(out, blen, 16);
            outbits(out, blen ^ 0xFFFF, 16);

            /*
             * Do the `compression': we need to pass the data to
             * lz77_compress so that it will be taken into account
             * for subsequent (distance,length) pairs. But
             * lz77_compress is passed FALSE, which means it won't
             * actually find (or even look for) any matches; so
             * every character will be passed straight to
             * zlib_literal which will spot out->comp_disabled and
             * emit in the uncompressed format.
             */
            lz77_compress(&ectx, block, blen, FALSE);

            len -= blen;
            block += blen;
        }
        outbits(out, 2, 3);		       /* open new block */
    } else {
        if (!in_block) {
            /*
             * Start a Deflate (RFC1951) fixed-trees block. We
             * transmit a zero bit (BFINAL=0), followed by a zero
             * bit and a one bit (BTYPE=01). Of course these are in
             * the wrong order (01 0).
             */
            outbits(out, 2, 3);
        }

        /*
         * Do the compression.
         */
        lz77_compress(&ectx, block, len, TRUE);

        /*
         * End the block (by transmitting code 256, which is
         * 0000000 in fixed-tree mode), and transmit some empty
         * blocks to ensure we have emitted the byte containing the
         * last piece of genuine data. There are three ways we can
         * do this:
         *
         *  - Minimal flush. Output end-of-block and then open a
         *    new static block. This takes 9 bits, which is
         *    guaranteed to flush out the last genuine code in the
         *    closed block; but allegedly zlib can't handle it.
         *
         *  - Zlib partial flush. Output EOB, open and close an
         *    empty static block, and _then_ open the new block.
         *    This is the best zlib can handle.
         *
         *  - Zlib sync flush. Output EOB, then an empty
         *    _uncompressed_ block (000, then sync to byte
         *    boundary, then send bytes 00 00 FF FF). Then open the
         *    new block.
         *
         * For the moment, we will use Zlib partial flush.
         */
        outbits(out, 0, 7);		       /* close block */
        outbits(out, 2, 3+7);	       /* empty static block */
        outbits(out, 2, 3);		       /* open new block */
    }

    out->comp_disabled = FALSE;

    *outblock = out->outbuf;
    *outlen = out->outlen;

    return 1;
}

/* ----------------------------------------------------------------------
 * Zlib decompression. Of course, even though our compressor always
 * uses static trees, our _decompressor_ has to be capable of
 * handling dynamic trees if it sees them.
 */

/*
 * The way we work the Huffman decode is to have a table lookup on
 * the first N bits of the input stream (in the order they arrive,
 * of course, i.e. the first bit of the Huffman code is in bit 0).
 * Each table entry lists the number of bits to consume, plus
 * either an output code or a pointer to a secondary table.
 */
struct zlib_table;
struct zlib_tableentry;

struct zlib_tableentry {
    unsigned char nbits;
    short code;
    struct zlib_table *nexttable;
};

struct zlib_table {
    int mask;                          /* mask applied to input bit stream */
    struct zlib_tableentry *table;
};

#define MAXCODELEN 16
#define MAXSYMS 288

/*
 * Build a single-level decode table for elements
 * [minlength,maxlength) of the provided code/length tables, and
 * recurse to build subtables.
 */
static struct zlib_table *zlib_mkonetab(int *codes, unsigned char *lengths,
                                        int nsyms,
                                        int pfx, int pfxbits, int bits) {
    struct zlib_table *tab = smalloc(sizeof(struct zlib_table));
    int pfxmask = (1 << pfxbits) - 1;
    int nbits, i, j, code;

    tab->table = smalloc((1 << bits) * sizeof(struct zlib_tableentry));
    tab->mask = (1 << bits) - 1;

    for (code = 0; code <= tab->mask; code++) {
        tab->table[code].code = -1;
        tab->table[code].nbits = 0;
        tab->table[code].nexttable = NULL;
    }

    for (i = 0; i < nsyms; i++) {
        if (lengths[i] <= pfxbits || (codes[i] & pfxmask) != pfx)
            continue;
        code = (codes[i] >> pfxbits) & tab->mask;
        for (j = code; j <= tab->mask; j += 1 << (lengths[i]-pfxbits)) {
            tab->table[j].code = i;
            nbits = lengths[i] - pfxbits;
            if (tab->table[j].nbits < nbits)
                tab->table[j].nbits = nbits;
        }
    }
    for (code = 0; code <= tab->mask; code++) {
        if (tab->table[code].nbits <= bits)
            continue;
        /* Generate a subtable. */
        tab->table[code].code = -1;
        nbits = tab->table[code].nbits - bits;
        if (nbits > 7)
            nbits = 7;
        tab->table[code].nbits = bits;
        tab->table[code].nexttable = zlib_mkonetab(codes, lengths, nsyms,
                                                   pfx | (code << pfxbits),
                                                   pfxbits + bits, nbits);
    }

    return tab;
}

/*
 * Build a decode table, given a set of Huffman tree lengths.
 */
static struct zlib_table *zlib_mktable(unsigned char *lengths, int nlengths) {
    int count[MAXCODELEN], startcode[MAXCODELEN], codes[MAXSYMS];
    int code, maxlen;
    int i, j;

    /* Count the codes of each length. */
    maxlen = 0;
    for (i = 1; i < MAXCODELEN; i++) count[i] = 0;
    for (i = 0; i < nlengths; i++) {
        count[lengths[i]]++;
        if (maxlen < lengths[i])
            maxlen = lengths[i];
    }
    /* Determine the starting code for each length block. */
    code = 0;
    for (i = 1; i < MAXCODELEN; i++) {
        startcode[i] = code;
        code += count[i];
        code <<= 1;
    }
    /* Determine the code for each symbol. Mirrored, of course. */
    for (i = 0; i < nlengths; i++) {
        code = startcode[lengths[i]]++;
        codes[i] = 0;
        for (j = 0; j < lengths[i]; j++) {
            codes[i] = (codes[i] << 1) | (code & 1);
            code >>= 1;
        }
    }

    /*
     * Now we have the complete list of Huffman codes. Build a
     * table.
     */
    return zlib_mkonetab(codes, lengths, nlengths, 0, 0,
                         maxlen < 9 ? maxlen : 9);
}

static int zlib_freetable(struct zlib_table ** ztab) {
    struct zlib_table *tab;
    int code;

    if (ztab == NULL)
	return -1;

    if (*ztab == NULL)
	return 0;

    tab = *ztab;

    for (code = 0; code <= tab->mask; code++)
	if (tab->table[code].nexttable != NULL)
	    zlib_freetable(&tab->table[code].nexttable);

    sfree(tab->table);
    tab->table = NULL;

    sfree(tab);
    *ztab = NULL;

    return(0);
}

static struct zlib_decompress_ctx {
    struct zlib_table *staticlentable, *staticdisttable;
    struct zlib_table *currlentable, *currdisttable, *lenlentable;
    enum {
        START, OUTSIDEBLK,
        TREES_HDR, TREES_LENLEN, TREES_LEN, TREES_LENREP,
        INBLK, GOTLENSYM, GOTLEN, GOTDISTSYM,
        UNCOMP_LEN, UNCOMP_NLEN, UNCOMP_DATA
    } state;
    int sym, hlit, hdist, hclen, lenptr, lenextrabits, lenaddon, len, lenrep;
    int uncomplen;
    unsigned char lenlen[19];
    unsigned char lengths[286+32];
    unsigned long bits;
    int nbits;
    unsigned char window[WINSIZE];
    int winpos;
    unsigned char *outblk;
    int outlen, outsize;
} dctx;

void zlib_decompress_init(void) {
    unsigned char lengths[288];
    memset(lengths, 8, 144);
    memset(lengths+144, 9, 256-144);
    memset(lengths+256, 7, 280-256);
    memset(lengths+280, 8, 288-280);
    dctx.staticlentable = zlib_mktable(lengths, 288);
    memset(lengths, 5, 32);
    dctx.staticdisttable = zlib_mktable(lengths, 32);
    dctx.state = START;                /* even before header */
    dctx.currlentable = dctx.currdisttable = dctx.lenlentable = NULL;
    dctx.bits = 0;
    dctx.nbits = 0;
    logevent("Initialised zlib (RFC1950) decompression");
}

int zlib_huflookup(unsigned long *bitsp, int *nbitsp, struct zlib_table *tab) {
    unsigned long bits = *bitsp;
    int nbits = *nbitsp;
    while (1) {
        struct zlib_tableentry *ent;
        ent = &tab->table[bits & tab->mask];
        if (ent->nbits > nbits)
            return -1;                 /* not enough data */
        bits >>= ent->nbits;
        nbits -= ent->nbits;
        if (ent->code == -1)
            tab = ent->nexttable;
        else {
            *bitsp = bits;
            *nbitsp = nbits;
            return ent->code;
        }
    }
}

static void zlib_emit_char(int c) {
    dctx.window[dctx.winpos] = c;
    dctx.winpos = (dctx.winpos + 1) & (WINSIZE-1);
    if (dctx.outlen >= dctx.outsize) {
	dctx.outsize = dctx.outlen + 512;
	dctx.outblk = srealloc(dctx.outblk, dctx.outsize);
    }
    dctx.outblk[dctx.outlen++] = c;
}

#define EATBITS(n) ( dctx.nbits -= (n), dctx.bits >>= (n) )

int zlib_decompress_block(unsigned char *block, int len,
			  unsigned char **outblock, int *outlen) {
    const coderecord *rec;
    int code, blktype, rep, dist, nlen;
    static const unsigned char lenlenmap[] = {
        16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
    };

    dctx.outblk = NULL;
    dctx.outsize = dctx.outlen = 0;

    while (len > 0 || dctx.nbits > 0) {
        while (dctx.nbits < 24 && len > 0) {
            dctx.bits |= (*block++) << dctx.nbits;
            dctx.nbits += 8;
            len--;
        }
        switch (dctx.state) {
          case START:
            /* Expect 16-bit zlib header, which we'll dishonourably ignore. */
            if (dctx.nbits < 16)
                goto finished;	       /* done all we can */
            EATBITS(16);
            dctx.state = OUTSIDEBLK;
            break;
          case OUTSIDEBLK:
            /* Expect 3-bit block header. */
            if (dctx.nbits < 3)
                goto finished;	       /* done all we can */
            EATBITS(1);
            blktype = dctx.bits & 3;
            EATBITS(2);
            if (blktype == 0) {
		int to_eat = dctx.nbits & 7;
                dctx.state = UNCOMP_LEN;
		EATBITS(to_eat);       /* align to byte boundary */
            } else if (blktype == 1) {
                dctx.currlentable = dctx.staticlentable;
                dctx.currdisttable = dctx.staticdisttable;
                dctx.state = INBLK;
            } else if (blktype == 2) {
                dctx.state = TREES_HDR;
            }
            break;
          case TREES_HDR:
            /*
             * Dynamic block header. Five bits of HLIT, five of
             * HDIST, four of HCLEN.
             */
            if (dctx.nbits < 5+5+4)
                goto finished;	       /* done all we can */
            dctx.hlit = 257 + (dctx.bits & 31); EATBITS(5);
            dctx.hdist = 1 + (dctx.bits & 31); EATBITS(5);
            dctx.hclen = 4 + (dctx.bits & 15); EATBITS(4);
            dctx.lenptr = 0;
            dctx.state = TREES_LENLEN;
            memset(dctx.lenlen, 0, sizeof(dctx.lenlen));
            break;
          case TREES_LENLEN:
            if (dctx.nbits < 3)
                goto finished;
            while (dctx.lenptr < dctx.hclen && dctx.nbits >= 3) {
                dctx.lenlen[lenlenmap[dctx.lenptr++]] =
		    (unsigned char)(dctx.bits & 7);
                EATBITS(3);
            }
            if (dctx.lenptr == dctx.hclen) {
                dctx.lenlentable = zlib_mktable(dctx.lenlen, 19);
                dctx.state = TREES_LEN;
                dctx.lenptr = 0;
            }
            break;
          case TREES_LEN:
            if (dctx.lenptr >= dctx.hlit+dctx.hdist) {
                dctx.currlentable = zlib_mktable(dctx.lengths, dctx.hlit);
                dctx.currdisttable = zlib_mktable(dctx.lengths + dctx.hlit,
                                                  dctx.hdist);
		zlib_freetable(&dctx.lenlentable);
		dctx.state = INBLK;
                break;
            }
            code = zlib_huflookup(&dctx.bits, &dctx.nbits, dctx.lenlentable);
            if (code == -1)
                goto finished;
            if (code < 16)
                dctx.lengths[dctx.lenptr++] = code;
            else {
                dctx.lenextrabits = (code == 16 ? 2 : code == 17 ? 3 : 7);
                dctx.lenaddon = (code == 18 ? 11 : 3);
                dctx.lenrep = (code == 16 && dctx.lenptr > 0 ?
                               dctx.lengths[dctx.lenptr-1] : 0);
                dctx.state = TREES_LENREP;
            }
            break;
          case TREES_LENREP:
            if (dctx.nbits < dctx.lenextrabits)
                goto finished;
            rep = dctx.lenaddon + (dctx.bits & ((1<<dctx.lenextrabits)-1));
            EATBITS(dctx.lenextrabits);
            while (rep > 0 && dctx.lenptr < dctx.hlit+dctx.hdist) {
                dctx.lengths[dctx.lenptr] = dctx.lenrep;
                dctx.lenptr++;
                rep--;
            }
            dctx.state = TREES_LEN;
            break;
          case INBLK:
            code = zlib_huflookup(&dctx.bits, &dctx.nbits, dctx.currlentable);
            if (code == -1)
                goto finished;
            if (code < 256)
		zlib_emit_char(code);
            else if (code == 256) {
                dctx.state = OUTSIDEBLK;
		if (dctx.currlentable != dctx.staticlentable)
		    zlib_freetable(&dctx.currlentable);
		if (dctx.currdisttable != dctx.staticdisttable)
		    zlib_freetable(&dctx.currdisttable);
            } else if (code < 286) {   /* static tree can give >285; ignore */
                dctx.state = GOTLENSYM;
                dctx.sym = code;
            }
            break;
          case GOTLENSYM:
            rec = &lencodes[dctx.sym - 257];
            if (dctx.nbits < rec->extrabits)
                goto finished;
            dctx.len = rec->min + (dctx.bits & ((1<<rec->extrabits)-1));
            EATBITS(rec->extrabits);
            dctx.state = GOTLEN;
            break;
          case GOTLEN:
            code = zlib_huflookup(&dctx.bits, &dctx.nbits, dctx.currdisttable);
            if (code == -1)
                goto finished;
            dctx.state = GOTDISTSYM;
            dctx.sym = code;
            break;
          case GOTDISTSYM:
            rec = &distcodes[dctx.sym];
            if (dctx.nbits < rec->extrabits)
                goto finished;
            dist = rec->min + (dctx.bits & ((1<<rec->extrabits)-1));
            EATBITS(rec->extrabits);
            dctx.state = INBLK;
	    while (dctx.len--)
		zlib_emit_char(dctx.window[(dctx.winpos-dist) & (WINSIZE-1)]);
	    break;
	  case UNCOMP_LEN:
	    /*
	     * Uncompressed block. We expect to see a 16-bit LEN.
	     */
	    if (dctx.nbits < 16)
		goto finished;
	    dctx.uncomplen = dctx.bits & 0xFFFF;
	    EATBITS(16);
	    dctx.state = UNCOMP_NLEN;
	    break;
	  case UNCOMP_NLEN:
	    /*
	     * Uncompressed block. We expect to see a 16-bit NLEN,
	     * which should be the one's complement of the previous
	     * LEN.
	     */
	    if (dctx.nbits < 16)
		goto finished;
	    nlen = dctx.bits & 0xFFFF;
	    EATBITS(16);
	    dctx.state = UNCOMP_DATA;
	    break;
	  case UNCOMP_DATA:
	    if (dctx.nbits < 8)
		goto finished;
	    zlib_emit_char(dctx.bits & 0xFF);
	    EATBITS(8);
	    if (--dctx.uncomplen == 0)
		dctx.state = OUTSIDEBLK;   /* end of uncompressed block */
	    break;
        }
    }

    finished:
    *outblock = dctx.outblk;
    *outlen = dctx.outlen;

    return 1;
}

const struct ssh_compress ssh_zlib = {
    "zlib",
    zlib_compress_init,
    zlib_compress_block,
    zlib_decompress_init,
    zlib_decompress_block,
    zlib_disable_compression
};