/* SimpleHuffman.c - a simple program that uses Huffman trees to encode files ** see left404.com/2011/08/11/simplehuffman for more information. ** ** Copyright (C) 8/11/2011 Dara Hazeghi ** ** This software may be freely modified and redistributed. It has no warranty. */ #include #include #include #include #define CHAR_RANGE 257 /* max number of character values */ #define FAKE_EOF 256 /* special value to signify end-of-file */ #define CHAR_BITS 8 #define BUF_LEN 1024 /* Huffman tree structure */ typedef struct htree { struct htree *left; struct htree *right; int letter; int freq; } htree; /* a buffer for bits to be written out */ typedef struct { unsigned int bits; /* the actual bits */ unsigned int pos; /* position in the bits array */ unsigned int curBit; /* current position - used for output */ unsigned int availableBits; /* bits available in the slot */ unsigned int buf[BUF_LEN]; /* the whole buffer */ } BITBUFFER; /* compare two Huffman trees based on frequency, descending order */ int CmpTrees(const void *a, const void *b) { const htree **x = (const htree **) a, **y = (const htree **) b; if((*x)->freq == (*y)->freq) return 0; else return ((*x)->freq < (*y)->freq) ? 1 : -1; } /* create a new string with given letter concatenated on to the prefix */ char *Concat(char *prefix, char letter) { char *result = (char *)malloc(strlen(prefix) + 2); sprintf(result, "%s%c", prefix, letter); return result; } /* convert ASCII bitstring to integer bitstring * bitstring is an array of 2 elements - bits at index 0, length at index 1 * bits are left aligned, zero-padded on the right * so the string "1110" would give a bitstring where str[1] = 4 and * str[0] = 1110 0000 0000 0000 0000 0000 0000 0000 (in binary on a BE machine) */ unsigned int *StringToBitstring(char *encoding) { unsigned int *bitstring = (unsigned int *)malloc(2 * sizeof(unsigned int)); unsigned int place = 1; bitstring[0] = 0; bitstring[1] = strlen(encoding); /* doesn't work for larger bitstrings */ assert(bitstring[1] < 32 && bitstring[1] > 0); while(*encoding) { if(*encoding == '1') bitstring[0] |= place; place <<= 1; encoding++; } bitstring[0] <<= (32 - bitstring[1]); return bitstring; } /* get specified number of bits from given bitstring, left-aligned */ unsigned int GetNBits(unsigned int bits, int len, int n) { bits >>= (32-len); /* get rid of trailing junk */ return bits << (32 - n); /* get rid of leading junk */ } /* print specified error message and quite */ void Error(const char *msg) { fprintf(stderr, "Error: %s\n", msg); exit(1); } /* build and return a Huffman tree based on a frequency table */ htree *BuildTree(int frequencies[]) { int i, len = 0; htree *queue[CHAR_RANGE]; /* create trees for each character, add to the queue */ for(i = 0; i < CHAR_RANGE; i++) { if(frequencies[i]) { htree *toadd = (htree *)calloc(1, sizeof(htree)); toadd->letter = i; toadd->freq = frequencies[i]; queue[len++] = toadd; } } while(len > 1) { htree *toadd = (htree *)malloc(sizeof(htree)); /* sort - smallest frequency trees are last */ qsort(queue, len, sizeof(htree *), CmpTrees); /* dequeue two lowest frequency trees, build new tree from them */ toadd->left = queue[--len]; toadd->right = queue[--len]; toadd->freq = toadd->left->freq + toadd->right->freq; queue[len++] = toadd; /* insert back in the queue */ } return queue[0]; /* last tree in the queue is the full Huffman tree */ } /* deallocate given Huffman tree */ void FreeTree(htree *tree) { if(tree) { FreeTree(tree->left); FreeTree(tree->right); free(tree); } } /* traverse the Huffman tree to build up a table of encodings */ void TraverseTree(htree *tree, unsigned int **table, char *prefix) { if(!tree->left && !tree->right) { table[tree->letter] = StringToBitstring(prefix); } else { if(tree->left) TraverseTree(tree->left, table, Concat(prefix, '0')); if(tree-> right) TraverseTree(tree->right, table, Concat(prefix, '1')); } free(prefix); } /* build a table of Huffman bitstring encodings given a set of frequencies */ unsigned int **BuildTable(int frequencies[]) { char *prefix = (char *)calloc(1, sizeof(char)); unsigned int **table = (unsigned int **)calloc(CHAR_RANGE, sizeof(unsigned int *)); htree *tree = BuildTree(frequencies); TraverseTree(tree, table, prefix); return table; } /* deallocate table of Huffman encodings */ void FreeTable(unsigned int *table[]) { int i; for(i = 0; i < CHAR_RANGE; i++) if(table[i]) free(table[i]); free(table); } /* output the Huffman header for an encoded file */ void WriteHeader(FILE *out, int frequencies[]) { int i, count = 0; for(i = 0; i < CHAR_RANGE; i++) if(frequencies[i]) count++; fprintf(out, "%d\n", count); for(i = 0; i < CHAR_RANGE; i++) if(frequencies[i]) fprintf(out, "%d %d\n", i, frequencies[i]); } /* read in the header of a Huffman encoded file */ int *ReadHeader(FILE *in) { static int frequencies[CHAR_RANGE]; int i, count, letter, freq; if(fscanf(in, "%d", &count) != 1) Error("invalid input file."); for(i = 0; i < count; i++) { if((fscanf(in, "%d %d", &letter, &freq) != 2) || letter < 0 || letter >= CHAR_RANGE) Error("invalid input file."); frequencies[letter] = freq; } fgetc(in); /* discard last newline */ return frequencies; } /* write the given bit encoding to the output file */ void WriteBits(unsigned int bits, unsigned int len, FILE *out, BITBUFFER *buffer) { /* enough space in current block for entire bitstring to be appended */ if(len <= buffer->availableBits) { buffer->availableBits -= len; /* append the bits by shifting, then or-ing */ buffer->bits |= (bits >> buffer->availableBits); len = 0; } else /* otherwise fit what we can in the current block */ { buffer->bits |= GetNBits(bits, len, buffer->availableBits); len -= buffer->availableBits; buffer->availableBits = 0; } if(buffer->availableBits == 0) /* if current block is filled */ { buffer->buf[buffer->pos] = buffer->bits; /* add block to buffer */ buffer->pos++; /* move forward */ buffer->bits = 0; /* next block is empty */ buffer->availableBits = 32; /* all bits available */ /* buffer can only be full if current block is full */ if(buffer->pos == BUF_LEN) /* buffer is full, write to disk */ { fwrite(buffer->buf, sizeof(int), BUF_LEN, out); buffer->pos = 0; /* back to start of buffer */ } if(len != 0) /* had some leftovers - place in the next block */ { buffer->availableBits -= len; /* shift off the bits we've already used, add remainder to block */ buffer->bits = bits >> buffer->availableBits; } } } void WriteBits2(unsigned int bits, unsigned int len, FILE *out, BITBUFFER *buffer) { /* enough space in current block for entire bitstring to be appended */ if(len <= buffer->availableBits) { buffer->availableBits -= len; /* append the bits by shifting, then or-ing */ buffer->bits |= (bits >> buffer->availableBits); if(buffer->availableBits == 0) /* if current block is filled */ { buffer->buf[buffer->pos] = buffer->bits; buffer->availableBits = 32; buffer->pos++; buffer->bits = 0; } len = 0; } else /* fit what we can in the current block */ { buffer->buf[buffer->pos] = buffer->bits | GetNBits(bits, len, buffer->availableBits); buffer->pos++; buffer->bits = 0; len -= buffer->availableBits; buffer->availableBits = 32; } if(buffer->pos == BUF_LEN) /* buffer is full, write to disk */ { fwrite(buffer->buf, sizeof(int), BUF_LEN, out); buffer->pos = 0; } if(len != 0) /* had some leftovers - place in the next block */ { buffer->availableBits -= len; /* the shift shifts off the bits we've already used */ buffer->bits = bits >> buffer->availableBits; } } /* write remaining bits in the buffer to the output file */ void FlushBits(BITBUFFER *buffer, FILE *out) { if(buffer->availableBits!=0) /* output buffer isn't empty */ { buffer->buf[buffer->pos] = buffer->bits; if(fwrite(buffer->buf, sizeof(int), buffer->pos+1, out) != buffer->pos+1) Error("could not write to output file"); } } /* read a single bit from the input file */ int ReadBit(FILE *in, BITBUFFER *buffer) { buffer->curBit <<= 1; if(buffer->curBit == 0) /* no more bits left in the current block */ { if(buffer->pos == BUF_LEN) /* no more unread blocks, must fetch */ { fread(buffer->buf, sizeof(int), BUF_LEN, in); buffer->pos = 0; } buffer->bits = buffer->buf[buffer->pos]; buffer->pos++; buffer->curBit = 1; } /* extract bit - this is >2x faster than divide and mode */ return buffer->bits & buffer->curBit; } /* decode and return a single character from the input using the given Huffman * tree */ int DecodeChar(FILE *in, htree *tree, BITBUFFER *buffer) { while(tree->left || tree->right) { if(ReadBit(in, buffer)) tree = tree->right; else tree = tree->left; if(!tree) Error("invalid input file."); } return tree->letter; } /* decode the Huffman-encoded file in and save the results to out */ void Decode(FILE *in, FILE *out) { int *frequencies, c, i = 0; htree *tree; char writebuf[BUF_LEN * sizeof(int)]; BITBUFFER buffer = {0, BUF_LEN, 1<<31, 0}; frequencies = ReadHeader(in); tree = BuildTree(frequencies); while((c = DecodeChar(in, tree, &buffer)) != FAKE_EOF) { writebuf[i++] = c; if(i == BUF_LEN * sizeof(int)) { fwrite(writebuf, 1, i, out); i = 0; } } fwrite(writebuf, 1, i, out); FreeTree(tree); } /* create a Huffman encoding for the file in and save the encoded version to * out */ void Encode(FILE *in, FILE *out) { int len, i, frequencies[CHAR_RANGE] = { 0 }; unsigned int **table; BITBUFFER buffer = { 0, 0, 0, 32}; char readbuf[BUF_LEN * sizeof(int)]; do { len = fread(readbuf, 1, BUF_LEN * sizeof(int), in); for(i = 0; i < len; i++) { frequencies[(unsigned char)readbuf[i]]++; } } while(len == BUF_LEN * sizeof(int)); frequencies[FAKE_EOF] = 1; rewind(in); table = BuildTable(frequencies); WriteHeader(out, frequencies); do { len = fread(readbuf, 1, BUF_LEN * sizeof(int), in); for(i = 0; i < len; i++) WriteBits(table[(unsigned char)readbuf[i]][0], table[(unsigned char)readbuf[i]][1], out, &buffer); } while(len == BUF_LEN * sizeof(int)); /* use FAKE_EOF to indicate end of input */ WriteBits(table[FAKE_EOF][0], table[FAKE_EOF][1], out, &buffer); FlushBits(&buffer, out); FreeTable(table); } /* program to encode and decode files using Huffman trees */ int main(int argc, char *argv[]) { FILE *in, *out; if(argc != 4 || (strcmp(argv[1], "-c") && strcmp(argv[1], "-d"))) { fprintf(stderr, "Usage: %s [-c,-d] infile outfile\n", argv[0]); exit(0); } if(!(in = fopen(argv[2], "rb"))) Error("input file couldn't be opened."); else if((out = fopen(argv[3], "rb"))) Error("output file already exists."); else if(!(out = fopen(argv[3], "wb"))) Error("output file couldn't be opened."); if(!strcmp(argv[1], "-c")) Encode(in, out); else Decode(in, out); fclose(in); fclose(out); return 0; }