characters and huffman

Go To Last Post
7 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Guys, I've created a Huffman compression algorithm and it functions correctly with characters, "a,b, c, d, , , z". However, when I try to send data if fails.  Either by freezing or corrupt data.  Anyway, have a quick look at the code.

#include <stdlib.h>
#include "stdint.h"
#include "string.h"
#include "stdio.h"

/** In computer science and information theory, a Huffman code is a particular
 *  type of optimal prefix code that is commonly used for lossless data
 *  compression. The process of finding or using such a code proceeds by means
 *   of Huffman coding, an algorithm developed by David A. Huffman while he
 *   was a Sc.D. student at MIT, and published in the 1952 paper "A Method for
 *    the Construction of Minimum-Redundancy Codes".[1]
 */

int SubTrees;

static struct Huffman_Node * retrieve_small_leaf1(struct Huffman_Node * node, int OffSet)
{

	for (int i = OffSet; ((i != 0) && (node->value == 0)); i--, node--);
	return node;
}

static struct Huffman_Node * retrieve_small_leaf2(struct Huffman_Node * node, int OffSet)
{

	for (int i = OffSet; ((i != 0) && (node->value == 0)); i--, node--);
	return node - 1;
}


static struct Huffman_Node * build_huffman_graph(unsigned char * inRaw, unsigned int Data){


	struct RawDat histogram[256] = {0};


	// clear binary histogram
	for(int i = 0; i < 255; i++){

		histogram[i].repitions = 0;
		histogram[i].dat = 0;
	}


	// derive binary histogram from raw data
	for(int i = 0; i < Data; i++, inRaw++){

		histogram[(unsigned int)* inRaw].repitions++;
		histogram[(unsigned int)* inRaw].dat = * inRaw;
	}


	// find the number of active sub trees
	for(int g = 0; g != 255; g++){
		if(histogram[g].repitions != 0){
			SubTrees++;
		}
	}


	struct RawDat swap;
	int c, d;

	// bubble sort the histogram
	for (c = 0 ; c < 255 - 1; c++){
	    for (d = 0 ; d < 255 - 1; d++){

	    	if ((histogram[d].repitions < histogram[d + 1].repitions)){

	    		memcpy(&swap, &histogram[d], sizeof(struct RawDat));
	    		memcpy(&histogram[d], &histogram[d + 1], sizeof(struct RawDat));
	    		memcpy(&histogram[d + 1], &swap, sizeof(struct RawDat));
	    	}
	    }
	}


	// fill in the data for each leaf
	struct Huffman_Node * node = safe_malloc((sizeof(struct Huffman_Node) * SubTrees));

	for (int f = 0; f < SubTrees; f++){


		node[f].type	= LEAF;
		node[f].value   = histogram[f].repitions;
		node[f].marker  = histogram[f].dat;
		node[f].left    = 0;
		node[f].right   = 0;
	}


	struct Huffman_Node * LeafOne = NULL;
	struct Huffman_Node * LeafTwo = NULL;

	int OffSet = SubTrees;

	while(SubTrees - 1){

		// get the two lowest leafs/branchs
		LeafOne = retrieve_small_leaf1(&node[OffSet - 1], OffSet);
		LeafTwo = retrieve_small_leaf2(&node[OffSet - 1], OffSet);

		struct Huffman_Node * leaf2 = safe_malloc(sizeof(struct Huffman_Node));
		memcpy(leaf2, LeafTwo, sizeof(struct Huffman_Node));



		// create new branch, this branch is also in contention
		LeafTwo->type   = BRANCH;
		LeafTwo->value  = LeafOne->value + leaf2->value;
		LeafTwo->left   = leaf2;
		LeafTwo->right  = LeafOne;
		LeafTwo->marker = '#';

		// remove from contention
		LeafOne->value = 0;



		struct Huffman_Node Temp;

		// quickly resort the remaining leafs and branches
		for (c = 0 ; c < SubTrees - 1; c++){
			for (d = 0 ; d < SubTrees - 1; d++){

				if (node[d].value < node[d + 1].value){

					memcpy(&Temp, &node[d], sizeof(struct Huffman_Node));
					memcpy(&node[d], &node[d + 1], sizeof(struct Huffman_Node));
					memcpy(&node[d + 1], &Temp, sizeof(struct Huffman_Node));
				}
			}
		}
		SubTrees--;
	}

	// tree is complete, return pointer to the tree
	return LeafTwo;
}


/* builds the huffman table with the bits for each entry. 1 stands for binary 0 and 2 for binary 1*/
static void build_huffman_code_table(unsigned int codeTable[], struct Huffman_Node * tree, int Code){

	if (tree->type == LEAF){

		// assign code to leaf
        codeTable[(int)tree->marker] = Code;
	}
    else{

    	// traverse the branches, 1 equals 0 and 2 equals 1
    	build_huffman_code_table(codeTable, tree->left, Code * 10 + 1);
    	build_huffman_code_table(codeTable, tree->right, Code * 10 + 2);
    }
	return;
}


static unsigned int mask(unsigned int bits_remaining){

	unsigned int mask = 0;

	for(int i = 0; i < bits_remaining; i++){
		mask = mask << 1; mask |= 0b00000000000000000000000000000001;
	}

	return mask;
}

static unsigned int codify(unsigned int code){

	char data[33];

	sprintf(data, "%33d", code);

	code = 0x00;

	for(int i = 32; i != 0; i--){

		code = code >> 1;

		if(data[i] == '2'){
			code |= 0b10000000000000000000000000000000;
		}
	}

	return code;
}

static void encode(unsigned char * inRaw, unsigned int * outCoded, unsigned int * codeTable, unsigned int DataSize){

	unsigned char bits_remaining = 32;

	char buf[33];
	int nr_bits = 0;

	while(DataSize--){

		// get Huffman code
		unsigned int code = codeTable[* inRaw++];

		nr_bits = 0;

		// calculate the size of the Huffman code
		itoa(code, buf, 10); nr_bits = strlen(buf);

		// create code byte, in binary
		code = codify(code);



		// if it fits in this byte then add it to the stream
		if(nr_bits <= bits_remaining){

			* outCoded = * outCoded << nr_bits;
			* outCoded |= code;

			bits_remaining -= nr_bits;

			// new stream byte required
			if(bits_remaining == 0){

				bits_remaining = 32;
				outCoded++;
			}
		}

		// does not fit in the stream byte
		else{


			// bytes remaining to add equals i
			nr_bits = nr_bits - bits_remaining;

			// add the top bits to the stream
			* outCoded = * outCoded << bits_remaining;
			* outCoded |= code >> nr_bits;

			// remaining code to be added to the stream
			code = code & mask(nr_bits);

			// next byte,load
			outCoded++;
			bits_remaining = 32;

			if(nr_bits <= bits_remaining){

				// insert Code into stream
				* outCoded = * outCoded << nr_bits;
				* outCoded |= code;

				// calculate bytes remaining
				bits_remaining -= nr_bits;

				// new stream byte required
				if(bits_remaining == 0){

					bits_remaining = 32;
					outCoded++;
				}
			}
		}
	}

	//pad remaining integer code
	* outCoded = * outCoded << bits_remaining;
}


static void decoder(unsigned int * inCoded, unsigned char * outRaw, struct Huffman_Node * Tree, unsigned int NrData){

	unsigned int Bit;
	struct Huffman_Node * node = Tree;


	while(1){

		for(int i = 0; i < 32; i++){

			Bit = * inCoded & 0b10000000000000000000000000000000;

			// move to the next Bit
			* inCoded = * inCoded << 1;

			if(Bit)
				node = node->right;
			else
				node = node->left;

			if(node->type == LEAF){

				// get decoded char and reset the algorithm
				* outRaw++ = node->marker; node = Tree;

				// quit decoding if we have completed
				if(NrData-- == 0)
					return;
			}
		}
		inCoded++;
	}
}

void huffman_encoder(unsigned char * inRaw, unsigned int DataSize, char ** outCoded9){

  	struct Huffman_Node * tree;
  	unsigned int HuffmanTable[256] = {0};


	unsigned char  * outCoded = safe_malloc(230500);

	tree = build_huffman_graph(inRaw, DataSize);
	build_huffman_code_table(HuffmanTable, tree, 0);


	encode(inRaw, (unsigned int *)outCoded, HuffmanTable, DataSize);
	decoder((unsigned int *)outCoded, inRaw, tree, DataSize);
}


 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

How does your encoder tell you how big the compressed output is, and how do you tell the decoder how big the compressed input is?

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

christop wrote:

How does your encoder tell you how big the compressed output is, and how do you tell the decoder how big the compressed input is?

 

 


	encode(inRaw, (unsigned int *)outCoded, HuffmanTable, DataSize);
	decoder((unsigned int *)outCoded, inRaw, tree, DataSize);

The variable DataSize tells us how big the input and output are.

 

    unsigned char  * outCoded = safe_malloc(230500);
 

is just a large buffer at the moment to handle the compressed data output.

Last Edited: Tue. Jan 7, 2020 - 07:32 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

But isn't that the size of the uncompressed data?

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I'm not interested in the size of the encoder output.  I've assigned a buffer large enough to hold the entire coded message.  

I tell the decoder how big the uncompressed data was and while its decoding we finish when the number of LEAFs matched the DataSize.  

 

			if(node->type == LEAF){

				// get decoded char and reset the algorithm
				* outRaw++ = node->marker; node = Tree;

				// quit decoding if we have completed
				if(NrData-- == 0)
					return;
			}

 

Last Edited: Tue. Jan 7, 2020 - 07:58 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

I assume from your code that your ints are 32-bit. codeTable[] is storing a decimal encoding. A 32-bit int can only store 10 decimal digits. A leaf deeper than 10 nodes is going to have a corrupted code. using a 64-bit unsigned long for codes will store 19 decimal digits which can handle a deeper tree. Using a char[] to store the code has no limitation and will run faster since you don't have to call sprintf() for every code in the compressed stream.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

balisong42 wrote:

I assume from your code that your ints are 32-bit. codeTable[] is storing a decimal encoding. A 32-bit int can only store 10 decimal digits. A leaf deeper than 10 nodes is going to have a corrupted code. using a 64-bit unsigned long for codes will store 19 decimal digits which can handle a deeper tree. Using a char[] to store the code has no limitation and will run faster since you don't have to call sprintf() for every code in the compressed stream.

 

Yup, I'm testing it with an image displaying coloured bars, there are eight colours hence its not a problem of running out of digits for the Huffman code.