Huffman Glitch

Go To Last Post
6 posts / 0 new
Author
Message
#1
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Guys, I have designed a compression routine that uses the Huffman tree.  It appears to work well enough but some characters such as "despite all the ‘predictions’ to the contrary" comes out "despite all the ‘predictions’ to the contrary".

#include "../../applications/compression/huffman.h"

#include <stdlib.h>
#include "stdint.h"
#include "string.h"
#include "stdio.h"

#include "../../arm_rtos/system_malloc.h"
#include "../../arm_rtos/serial_console/c_printf.h"

/** In computer science and information theory, a Huffman code is a particular
 *  type of optimal prefix code that is commonly used for lossless data
 *  compression. The process of finding or using such a code proceeds by means
 *   of Huffman coding, an algorithm developed by David A. Huffman while he
 *   was a Sc.D. student at MIT, and published in the 1952 paper "A Method for
 *    the Construction of Minimum-Redundancy Codes".[1]
 */

int32_t SubTrees;

static struct Huffman_Node * retrieve_small_leaf1(struct Huffman_Node * node, int32_t OffSet)
{

	for (int32_t i = OffSet; ((i != 0) && (node->value == 0)); i--, node--);
	return node;
}

static struct Huffman_Node * retrieve_small_leaf2(struct Huffman_Node * node, int32_t OffSet)
{

	for (int32_t i = OffSet; ((i != 0) && (node->value == 0)); i--, node--);
	return node - 1;
}

struct Huffman_Node * build_huffman_graph(uint8_t * inRaw, uint32_t Data){

	struct RawDat histogram[256] = {0};

	// clear binary histogram
	for(int32_t i = 0; i < 255; i++){

		histogram[i].repitions = 0;
		histogram[i].dat = 0;
	}

	// derive binary histogram from raw data
	for(int32_t i = 0; i < Data; i++, inRaw++){

		histogram[(uint32_t)* inRaw].repitions++;
		histogram[(uint32_t)* inRaw].dat = * inRaw;
	}

	// find the number of active sub trees
	for(int32_t g = 0; g != 255; g++){
		if(histogram[g].repitions != 0){
			SubTrees++;
		}
	}

	struct RawDat swap;
	int32_t c, d;

	// bubble sort the histogram
	for (c = 0 ; c < 255 - 1; c++){
	    for (d = 0 ; d < 255 - 1; d++){

	    	if ((histogram[d].repitions < histogram[d + 1].repitions)){

	    		memcpy(&swap, &histogram[d], sizeof(struct RawDat));
	    		memcpy(&histogram[d], &histogram[d + 1], sizeof(struct RawDat));
	    		memcpy(&histogram[d + 1], &swap, sizeof(struct RawDat));
	    	}
	    }
	}

	// fill in the data for each leaf
	struct Huffman_Node * node = rtos_malloc((sizeof(struct Huffman_Node) * SubTrees));

	for (int32_t f = 0; f < SubTrees; f++){

		node[f].type	= LEAF;
		node[f].value   = histogram[f].repitions;
		node[f].marker  = histogram[f].dat;
		node[f].left    = 0;
		node[f].right   = 0;
	}

	struct Huffman_Node * LeafOne = NULL;
	struct Huffman_Node * LeafTwo = NULL;

	int32_t OffSet = SubTrees;

	while(SubTrees - 1){

		// get the two lowest leafs/branchs
		LeafOne = retrieve_small_leaf1(&node[OffSet - 1], OffSet);
		LeafTwo = retrieve_small_leaf2(&node[OffSet - 1], OffSet);

		struct Huffman_Node * leaf2 = rtos_malloc(sizeof(struct Huffman_Node));
		memcpy(leaf2, LeafTwo, sizeof(struct Huffman_Node));

		// create new branch, this branch is also in contention
		LeafTwo->type   = BRANCH;
		LeafTwo->value  = LeafOne->value + leaf2->value;
		LeafTwo->left   = leaf2;
		LeafTwo->right  = LeafOne;
		LeafTwo->marker = '#';

		// remove from contention
		LeafOne->value = 0;

		struct Huffman_Node Temp;

		// quickly resort the remaining leafs and branches
		for (c = 0 ; c < SubTrees - 1; c++){
			for (d = 0 ; d < SubTrees - 1; d++){

				if (node[d].value < node[d + 1].value){

					memcpy(&Temp, &node[d], sizeof(struct Huffman_Node));
					memcpy(&node[d], &node[d + 1], sizeof(struct Huffman_Node));
					memcpy(&node[d + 1], &Temp, sizeof(struct Huffman_Node));
				}
			}
		}
		SubTrees--;
	}

	// tree is complete, return pointer to the tree
	return LeafTwo;
}

/* builds the huffman table with the bits for each entry. 1 stands for binary 0 and 2 for binary 1*/
void build_huffman_code_table(uint32_t codeTable[], struct Huffman_Node * tree, int32_t Code){

	if (tree->type == LEAF){

		// assign code to leaf
        codeTable[(int)tree->marker] = Code;
	}
    else{

    	// traverse the branches, 1 equals 0 and 2 equals 1
    	build_huffman_code_table(codeTable, tree->left, Code * 10 + 1);
    	build_huffman_code_table(codeTable, tree->right, Code * 10 + 2);
    }
	return;
}

static uint32_t mask(uint32_t bits_remaining){

	uint32_t mask = 0;

	for(int32_t i = 0; i < bits_remaining; i++){
		mask = mask << 1; mask |= 0b00000001;
	}

	return mask;
}

static uint16_t codify(uint32_t code){

	char data[34];

	sprintf(data, "%33d", (int)code);

	code = 0x00;

	for(int32_t i = 32; i != 0; i--){

		code = code >> 1;

		if(data[i] == '2'){
			code |= 0b10000000000000000000000000000000;
		}
	}

	return code;
}

uint32_t encode(uint8_t * inRaw, uint32_t * outCoded, uint32_t * codeTable, uint32_t DataSize){

	uint8_t bits_remaining = 32;
	uint32_t compression_size = 0;

	char buf[16];
	int32_t nr_bits = 0;

	while(DataSize--){

		// get Huffman code
		uint32_t code = codeTable[* inRaw++];

		nr_bits = 0;

		// calculate the size of the Huffman code
		itoa(code, buf, 10); nr_bits = strlen(buf);

		// create code byte, in binary
		code = codify(code);

		// if it fits in this byte then add it to the stream
		if(nr_bits <= bits_remaining){

			* outCoded = * outCoded << nr_bits;
			* outCoded |= code;

			bits_remaining -= nr_bits;

			// new stream byte required
			if(bits_remaining == 0){

				bits_remaining = 32;

				outCoded++;
				compression_size++;
			}
		}

		// does not fit in the stream byte
		else{

			// bytes remaining to add equals i
			nr_bits = nr_bits - bits_remaining;

			// add the top bits to the stream
			* outCoded = * outCoded << bits_remaining;
			* outCoded |= code >> nr_bits;

			// remaining code to be added to the stream
			code = code & mask(nr_bits);

			// next byte,load
			outCoded++;
			compression_size++;

			bits_remaining = 32;

			if(nr_bits <= bits_remaining){

				// insert Code into stream
				* outCoded = * outCoded << nr_bits;
				* outCoded |= code;

				// calculate bytes remaining
				bits_remaining -= nr_bits;

				// new stream byte required
				if(bits_remaining == 0){

					bits_remaining = 32;
					outCoded++;
					compression_size++;
				}
			}
		}
	}

	//pad remaining integer code
	* outCoded = * outCoded << bits_remaining;
	return compression_size;
}

void decoder(uint32_t * inCoded, uint8_t * outRaw, struct Huffman_Node * Tree, uint32_t NrData){

	uint32_t Bit;
	struct Huffman_Node * node = Tree;

	while(1){

		for(int32_t i = 0; i < 32; i++){

			// get test bit
			Bit = * inCoded & 0b10000000000000000000000000000000;

			// move to the next Bit
			* inCoded = * inCoded << 1;

			if(Bit)
				node = node->right;
			else
				node = node->left;

			// if end point32_t the output code, else keep traversing the tree
			if(node->type == LEAF){

				// get decoded char and reset the algorithm
				* outRaw++ = node->marker; node = Tree;

				// quit decoding if we have completed
				if(NrData-- == 0)
					return;
			}
		}
		inCoded++;
	}
}

void huffman_encoder(uint8_t * inRaw, uint32_t DataSize, char * outCoded){

  	struct Huffman_Node * tree;
  	uint32_t codeTable1[256] = {0};

	outCoded = rtos_malloc(DataSize);

	tree = build_huffman_graph(inRaw, DataSize);
	build_huffman_code_table(codeTable1, tree, 0);

	uint32_t test = encode(inRaw, (uint32_t *)outCoded, codeTable1, DataSize);
	c_printf("\n\r[g]compressed size %i - %i\n\r", (int)DataSize, (int)test);

	rtos_realloc(outCoded, test);

	memset(inRaw, 0x00, DataSize);
	decoder((uint32_t *)outCoded, inRaw, tree, DataSize);

	for(int32_t i = 0; i < DataSize; i++, inRaw++)
		c_printf("%c", * inRaw);
}

I hope someone can spot the error.

 

Thanks Freaks.

 

compressed size 4984 - 702
Did it make a risky choice? Yes definitely. Is it better off? Definitely not, no
, and recent economic data show that pretty clearly. Comparing Q4 2019 (before t
he pandemic hit) to Q1 2022 (after it) only two regions are back above pre-pande
mic levels, London and Northern Ireland of Protocol fame. All other regions are
in the negative, some like the Western Midlands by some -10% of GDP. It is reall
y hard to call that “better offâ€ed. The numbers are really not there. Of cour
se, you may argue that economics is not everything and that it is a small price
to pay for a Global Britain, but first of all the price is not so small and seco
ndly the whole Global Britain thing is a bit of a rather bad joke. Certainly the
 US, China, Japan even Russia are not terribly impressed, And then there is the
EU. Yes, it is still there, despite all the ‘predictions’ to the contrary. I
n fact, it certainly does not look any less united than before. On the contrary.
 It has countries like Croatia and Bulgaria preparing to join the eurozone and a
 slew of candidates wanting, nay, demanding to join. A bit of a problem at times
, but rather a luxury problem if you consider the state of the Union of that kin
gdom of the isles that is now a third country. Did it makdddteotsdob rssdob rs d
efinitely. Is it better off? Definitely not, no, and recent economic data show t
hat pretty clearly. Comparing Q4 2019 (before the pandemic hit) to Q1 2022 (afte
r it) only two regions are back above pre-pandemic levels, London and Northern I
reland of Protocol fame. All other regions are in the negative, some like the We
stern Midlands by some -10% of GDP. It is really hard to call that “better off
â€ed. The numbers are really not there. Of course, you may argue that economics
is not everything and that it is a small price to pay for a Global Britain, but
first of all the price is not so small and secondly the whole Global Britain thi
ng is a bit of a rather bad joke. Certainly the US, China, Japan even Russia are
 not terribly impressed, And then there is the EU. Yes, it is still there, despi
te all the ‘predictions’ to the contrary. In fact, it certainly does not loo
k any less united than before. On the contrary. It has countries like Croatia an
d Bulgaria preparing to join the eurozone and a slew of candidates wanting, nay,
 demanding to join. A bit of a problem at times, but rather a luxury problem if
you consider the state of the Union of that kingdom of the isles that is now a t
hird country. Did it make a risky choice? Yes definitely. Is it better off? Defi
nitely not, no, and recent economic data show that pretty clearly. Comparing Q4
2019 (before the pandemic hit) to Q1 2022 (after it) only two regions are back a
bove pre-pandemic levels, London and Northern Ireland of Protocol fame. All othe
r regions are in the negative, some like the Western Midlands by some -10% of GD
P. It is really hard to call that “better offâ€ed. The numbers are really not
there. Of course, you may argue that economics is not everything and that it is
a small price to pay for a Global Britain, but first of all the price is not so
small and secondly the whole Global Britain thing is a bit of a rather bad joke.
 Certainly the US, China, Japan even Russia are not terribly impressed, And then
 there is the EU. Yes, it is still there, despite all the ‘predictions’ to t
he contrary. In fact, it certainly does not look any less united than before. On
 the contrary. It has countries like Croatia and Bulgaria preparing to join the
eurozone and a slew of candidates wanting, nay, demanding to join. A bit of a pr
oblem at times, but rather a luxury problem if you consider the state of the Uni
on of that kingdom of the isles that is now a third country. Did it make a risky
 choice? Yes definitely. Is it better off? Definitely not, no, and recent econom
ic data show that pretty clearly. Comparing Q4 2019 (before the pandemic hit) to
 Q1 2022 (after it) only two regions are back above pre-pandemic levels, London
and Northern Ireland of Protocol fame. All other regions are in the negative, so
me like the Western Midlands by some -10% of GDP. It is really hard to call that
 “better offâ€ed. The numbers are really not there. Of course, you may argue t
hat economics is not everything and that it is a small price to pay for a Global
 Britain, but first of all the price is not so small and secondly the whole Glob
al Britain thing is a bit of a rather bad joke. Certainly the US, China, Japan e
ven Russia are not terribly impressed, And then there is the EU. Yes, it is stil
l there, despite all the ‘predictions’ to the contrary. In fact, it certainl
y does not look any less united than before. On the contrary. It has countries l
ike Croatia and Bulgaria preparing to join the eurozone and a slew of candidates
 wanting, nay, demanding to join. A bit of a problem at times, but rather a luxu
ry problem if you consider the state of the Union of that kingdom of the isles t
hat is now a third count

 

This topic has a solution.
Last Edited: Thu. Jun 9, 2022 - 08:52 PM
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

As this Freak's website also has problems with character encoding it's not clear whether what you are describing is your actual problem or  a Freaks encoding thing so can you describe in words (not symbols) what you consider the problem to be? 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0


Have a look at the screen shot where I have highlighted the text area.

 

 

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Check the range on your for loops. There's a few were you only handle 255 of the 256 elements.

  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

balisong42 wrote:

Check the range on your for loops. There's a few were you only handle 255 of the 256 elements.

Do that now.

This reply has been marked as the solution. 
  • 1
  • 2
  • 3
  • 4
  • 5
Total votes: 0

Look at the characters either side of predictions. Your input text contains what are laughingly known as smart quotes rather than ascii quotes. Those smart quotes have a UTF-8 value which is three bytes long... Note that the initial byte of that group will always have bit 7 set (and groups with lengths other than three are quite possible).

 

Neil