characters and huffman

7 posts / 0 new
Author
Message

Guys, I've created a Huffman compression algorithm and it functions correctly with characters, "a,b, c, d, , , z". However, when I try to send data if fails.  Either by freezing or corrupt data.  Anyway, have a quick look at the code.

```#include <stdlib.h>
#include "stdint.h"
#include "string.h"
#include "stdio.h"

/** In computer science and information theory, a Huffman code is a particular
*  type of optimal prefix code that is commonly used for lossless data
*  compression. The process of finding or using such a code proceeds by means
*   of Huffman coding, an algorithm developed by David A. Huffman while he
*   was a Sc.D. student at MIT, and published in the 1952 paper "A Method for
*    the Construction of Minimum-Redundancy Codes".[1]
*/

int SubTrees;

static struct Huffman_Node * retrieve_small_leaf1(struct Huffman_Node * node, int OffSet)
{

for (int i = OffSet; ((i != 0) && (node->value == 0)); i--, node--);
return node;
}

static struct Huffman_Node * retrieve_small_leaf2(struct Huffman_Node * node, int OffSet)
{

for (int i = OffSet; ((i != 0) && (node->value == 0)); i--, node--);
return node - 1;
}

static struct Huffman_Node * build_huffman_graph(unsigned char * inRaw, unsigned int Data){

struct RawDat histogram[256] = {0};

// clear binary histogram
for(int i = 0; i < 255; i++){

histogram[i].repitions = 0;
histogram[i].dat = 0;
}

// derive binary histogram from raw data
for(int i = 0; i < Data; i++, inRaw++){

histogram[(unsigned int)* inRaw].repitions++;
histogram[(unsigned int)* inRaw].dat = * inRaw;
}

// find the number of active sub trees
for(int g = 0; g != 255; g++){
if(histogram[g].repitions != 0){
SubTrees++;
}
}

struct RawDat swap;
int c, d;

// bubble sort the histogram
for (c = 0 ; c < 255 - 1; c++){
for (d = 0 ; d < 255 - 1; d++){

if ((histogram[d].repitions < histogram[d + 1].repitions)){

memcpy(&swap, &histogram[d], sizeof(struct RawDat));
memcpy(&histogram[d], &histogram[d + 1], sizeof(struct RawDat));
memcpy(&histogram[d + 1], &swap, sizeof(struct RawDat));
}
}
}

// fill in the data for each leaf
struct Huffman_Node * node = safe_malloc((sizeof(struct Huffman_Node) * SubTrees));

for (int f = 0; f < SubTrees; f++){

node[f].type	= LEAF;
node[f].value   = histogram[f].repitions;
node[f].marker  = histogram[f].dat;
node[f].left    = 0;
node[f].right   = 0;
}

struct Huffman_Node * LeafOne = NULL;
struct Huffman_Node * LeafTwo = NULL;

int OffSet = SubTrees;

while(SubTrees - 1){

// get the two lowest leafs/branchs
LeafOne = retrieve_small_leaf1(&node[OffSet - 1], OffSet);
LeafTwo = retrieve_small_leaf2(&node[OffSet - 1], OffSet);

struct Huffman_Node * leaf2 = safe_malloc(sizeof(struct Huffman_Node));
memcpy(leaf2, LeafTwo, sizeof(struct Huffman_Node));

// create new branch, this branch is also in contention
LeafTwo->type   = BRANCH;
LeafTwo->value  = LeafOne->value + leaf2->value;
LeafTwo->left   = leaf2;
LeafTwo->right  = LeafOne;
LeafTwo->marker = '#';

// remove from contention
LeafOne->value = 0;

struct Huffman_Node Temp;

// quickly resort the remaining leafs and branches
for (c = 0 ; c < SubTrees - 1; c++){
for (d = 0 ; d < SubTrees - 1; d++){

if (node[d].value < node[d + 1].value){

memcpy(&Temp, &node[d], sizeof(struct Huffman_Node));
memcpy(&node[d], &node[d + 1], sizeof(struct Huffman_Node));
memcpy(&node[d + 1], &Temp, sizeof(struct Huffman_Node));
}
}
}
SubTrees--;
}

// tree is complete, return pointer to the tree
return LeafTwo;
}

/* builds the huffman table with the bits for each entry. 1 stands for binary 0 and 2 for binary 1*/
static void build_huffman_code_table(unsigned int codeTable[], struct Huffman_Node * tree, int Code){

if (tree->type == LEAF){

// assign code to leaf
codeTable[(int)tree->marker] = Code;
}
else{

// traverse the branches, 1 equals 0 and 2 equals 1
build_huffman_code_table(codeTable, tree->left, Code * 10 + 1);
build_huffman_code_table(codeTable, tree->right, Code * 10 + 2);
}
return;
}

static unsigned int mask(unsigned int bits_remaining){

for(int i = 0; i < bits_remaining; i++){
}

}

static unsigned int codify(unsigned int code){

char data[33];

sprintf(data, "%33d", code);

code = 0x00;

for(int i = 32; i != 0; i--){

code = code >> 1;

if(data[i] == '2'){
code |= 0b10000000000000000000000000000000;
}
}

return code;
}

static void encode(unsigned char * inRaw, unsigned int * outCoded, unsigned int * codeTable, unsigned int DataSize){

unsigned char bits_remaining = 32;

char buf[33];
int nr_bits = 0;

while(DataSize--){

// get Huffman code
unsigned int code = codeTable[* inRaw++];

nr_bits = 0;

// calculate the size of the Huffman code
itoa(code, buf, 10); nr_bits = strlen(buf);

// create code byte, in binary
code = codify(code);

// if it fits in this byte then add it to the stream
if(nr_bits <= bits_remaining){

* outCoded = * outCoded << nr_bits;
* outCoded |= code;

bits_remaining -= nr_bits;

// new stream byte required
if(bits_remaining == 0){

bits_remaining = 32;
outCoded++;
}
}

// does not fit in the stream byte
else{

// bytes remaining to add equals i
nr_bits = nr_bits - bits_remaining;

// add the top bits to the stream
* outCoded = * outCoded << bits_remaining;
* outCoded |= code >> nr_bits;

// remaining code to be added to the stream

outCoded++;
bits_remaining = 32;

if(nr_bits <= bits_remaining){

// insert Code into stream
* outCoded = * outCoded << nr_bits;
* outCoded |= code;

// calculate bytes remaining
bits_remaining -= nr_bits;

// new stream byte required
if(bits_remaining == 0){

bits_remaining = 32;
outCoded++;
}
}
}
}

* outCoded = * outCoded << bits_remaining;
}

static void decoder(unsigned int * inCoded, unsigned char * outRaw, struct Huffman_Node * Tree, unsigned int NrData){

unsigned int Bit;
struct Huffman_Node * node = Tree;

while(1){

for(int i = 0; i < 32; i++){

Bit = * inCoded & 0b10000000000000000000000000000000;

// move to the next Bit
* inCoded = * inCoded << 1;

if(Bit)
node = node->right;
else
node = node->left;

if(node->type == LEAF){

// get decoded char and reset the algorithm
* outRaw++ = node->marker; node = Tree;

// quit decoding if we have completed
if(NrData-- == 0)
return;
}
}
inCoded++;
}
}

void huffman_encoder(unsigned char * inRaw, unsigned int DataSize, char ** outCoded9){

struct Huffman_Node * tree;
unsigned int HuffmanTable[256] = {0};

unsigned char  * outCoded = safe_malloc(230500);

tree = build_huffman_graph(inRaw, DataSize);
build_huffman_code_table(HuffmanTable, tree, 0);

encode(inRaw, (unsigned int *)outCoded, HuffmanTable, DataSize);
decoder((unsigned int *)outCoded, inRaw, tree, DataSize);
}

```

How does your encoder tell you how big the compressed output is, and how do you tell the decoder how big the compressed input is?

christop wrote:

How does your encoder tell you how big the compressed output is, and how do you tell the decoder how big the compressed input is?

```
encode(inRaw, (unsigned int *)outCoded, HuffmanTable, DataSize);
decoder((unsigned int *)outCoded, inRaw, tree, DataSize);```

The variable DataSize tells us how big the input and output are.

unsigned char  * outCoded = safe_malloc(230500);

is just a large buffer at the moment to handle the compressed data output.

Last Edited: Tue. Jan 7, 2020 - 07:32 PM

But isn't that the size of the uncompressed data?

I'm not interested in the size of the encoder output.  I've assigned a buffer large enough to hold the entire coded message.

I tell the decoder how big the uncompressed data was and while its decoding we finish when the number of LEAFs matched the DataSize.

```			if(node->type == LEAF){

// get decoded char and reset the algorithm
* outRaw++ = node->marker; node = Tree;

// quit decoding if we have completed
if(NrData-- == 0)
return;
}
```

Last Edited: Tue. Jan 7, 2020 - 07:58 PM