数据压缩实验三：Huffman编解码

程序员文章站 2022-07-14 22:17:30

...

一、基本原理

1.Huffman编码算法

（）统计频率：将文件以ASCII字符流的形式读入，统计每个符号的发生频率；

（2）排序：将所有文件中出现过的字符按照频率从小到大的顺序排列；

（3）每一次选出最小的两个值，作为二叉树的两个叶子节点，将和作为它们的根节点，这两个叶子节点不再参与比较，新的根节点参与比较；重复此步骤，直到最后得到和为1的根节点；

（4）编码：将形成的二叉树的左节点标0，右节点标1，把从最上面的根节点到最下面的叶子节点途中遇到的0、1序列串起来，得到了各个字符的编码表示。

2.Huffman编码的数据结构设计

（1）Huffman节点结构

typedef struct huffman_node_tag
{
	unsigned char isLeaf;//是否为叶节点，1是0不是
	unsigned long count;//信源中出现频数
	struct huffman_node_tag *parent;//父节点指针

	union
	{
		struct//如果不是节结点，则此项为该节点左右子节点的指针
		{
			struct huffman_node_tag *zero, *one;//指向子节点，左0右1
		};
		unsigned char symbol;//如果是叶节点，则为某个信源符号
	};
} huffman_node;

（2）Huffman码结构

typedef struct huffman_code_tag
{
	/* The length of this code in bits. */
	unsigned long numbits;//码字的长度

	/* 码字的第1位存于bits[0]的第1位，
	码字的第2位存于bits[0]的第2位，
	码字的第8位存于bits[0]的第8位，
	码字的第9位存于bits[1]的第1位*/
	unsigned char *bits;
} huffman_code;

二、实验流程及代码分析

1.Huffman编码流程

（1）读入待编码的源文件

int
main(int argc, char** argv)
{
	char memory = 0;//是否对内存数据进行操作，1操作，0不操作
	char compress = 1;//compress为1解码，0编码
	int opt;
	const char *file_in = NULL, *file_out = NULL;
	//step1:add by yzhang for huffman statistics
	const char *file_out_table = NULL;
	//end by yzhang
	FILE *in = stdin;
	FILE *out = stdout;
	//step1:add by yzhang for huffman statistics
	FILE * outTable = NULL;
	//end by yzhang

	/* Get the command line arguments. */
	while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1) //读取命令行参数
	{
		switch(opt)
		{
		case 'i':
			file_in = optarg;//i 输入文件
			break;
		case 'o':
			file_out = optarg;//o 输出文件
			break;
		case 'c':
			compress = 1;//c 编码
			break;
		case 'd':
			compress = 0;//d 解码
			break;
		case 'h':
			usage(stdout);//h 输出参数用法的说明
			return 0;
		case 'v':
			version(stdout);//v 输出版本号的信息
			return 0;
		case 'm':
			memory = 1;//m 对内存数据进行操作
			break;
		// by yzhang for huffman statistics
		case 't':
			file_out_table = optarg;//t 输出中间数据信息			
			break;
		//end by yzhang
		default:
			usage(stderr);
			return 1;
		}
	}

	/* If an input file is given then open it. */
	if(file_in)//读取输入文件
	{
		in = fopen(file_in, "rb");
		if(!in)
		{
			fprintf(stderr,
					"Can't open input file '%s': %s\n",
					file_in, strerror(errno));
			return 1;
		}
	}

	/* If an output file is given then create it. */
	if(file_out)//创建输出文件
	{
		out = fopen(file_out, "wb");
		if(!out)
		{
			fprintf(stderr,
					"Can't open output file '%s': %s\n",
					file_out, strerror(errno));
			return 1;
		}
	}

	//by yzhang for huffman statistics
	if(file_out_table)
	{
		outTable = fopen(file_out_table, "w");
		if(!outTable)
		{
			fprintf(stderr,
				"Can't open output file '%s': %s\n",
				file_out_table, strerror(errno));
			return 1;
		}
	}
	//end by yzhang

	if(memory)//对内存数据进行编解码操作
	{
		return compress ?
			memory_encode_file(in, out) : memory_decode_file(in, out);
	}

	if(compress)  //change by yzhang
		huffman_encode_file(in, out,outTable);//step1:changed by yzhang from huffman_encode_file(in, out) to huffman_encode_file(in, out,outTable)
	else
	huffman_decode_file(in, out);

	if(in)
		fclose(in);
	if(out)
		fclose(out);
	if(outTable)
		fclose(outTable);
	return 0;
}

（2）第一次扫描：统计文件中各个字符出现频率

/*统计信源符号出现概率*/
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
	int c;
	unsigned int total_count = 0;//初始化信源符号总数使之为0

	/* Set all frequencies to 0. */
	init_frequencies(pSF);//初始化频率为0

	/* Count the frequency of each symbol in the input file. */
	while ((c = fgetc(in)) != EOF)//读取文件中每个信源符号
	{
		unsigned char uc = c;
		if (!(*pSF)[uc])
			(*pSF)[uc] = new_leaf_node(uc);//若没有此符号，建立新节点
		++(*pSF)[uc]->count;//字符发生频数+1
		++total_count;//总信源符号数+1
	}

	return total_count;
}

新建节点函数：new_leaf_node

/*新建结点*/
static huffman_node*
new_leaf_node(unsigned char symbol)
{
	huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));//开辟空间
	p->isLeaf = 1;//初始化为叶结点
	p->symbol = symbol;
	p->count = 0;
	p->parent = 0;
	return p;
}

（3）建立Huffman树

/*建立Huffman树*/
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
	unsigned int i = 0;
	unsigned int n = 0;
	huffman_node *m1 = NULL, *m2 = NULL;
	SymbolEncoder *pSE = NULL;

#if 1
	printf("BEFORE SORT\n");
	print_freqs(pSF);
#endif

	/* Sort the symbol frequency array by ascending frequency. */
	qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);//使用qshort函数将信源符号按出现频率大小排序，下标小的在前

#if 1	
	printf("AFTER SORT\n");
	print_freqs(pSF);
#endif

	/* Get the number of symbols. */
	for (n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)//统计种类总数，一个字节8bit，共256种
		;

	/*
	* Construct a Huffman tree. This code is based
	* on the algorithm given in Managing Gigabytes
	* by Ian Witten et al, 2nd edition, page 34.
	* Note that this implementation uses a simple
	* count instead of probability.
	*/
	for (i = 0; i < n - 1; ++i)
	{
		/* Set m1 and m2 to the two subsets of least probability. */
		m1 = (*pSF)[0];
		m2 = (*pSF)[1];//将出现次数最少的两个节点设为m1,m2

		/* Replace m1 and m2 with a set {m1, m2} whose probability
		* is the sum of that of m1 and m2. */
		(*pSF)[0] = m1->parent = m2->parent =
			new_nonleaf_node(m1->count + m2->count, m1, m2);
		(*pSF)[1] = NULL;//合并m1,m2

		/* Put newSet into the correct count position in pSF. */
		qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);//重新排序
	}

	/* Build the SymbolEncoder array from the tree. */
	pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
	memset(pSE, 0, sizeof(SymbolEncoder));
	build_symbol_encoder((*pSF)[0], pSE);//从树根开始构建码字
	return pSE;
}

排序函数：SFComp

/*将节点按出现概率从小到大排序，qsort函数中用到*/
static int
SFComp(const void *p1, const void *p2)
{
	const huffman_node *hn1 = *(const huffman_node**)p1;
	const huffman_node *hn2 = *(const huffman_node**)p2;

	/*将所有空节点排序 */
	if (hn1 == NULL && hn2 == NULL) //两个节点都为空
		return 0;//返回0
	if (hn1 == NULL)//第一个节点为空，第二个节点大
		return 1;//返回1
	if (hn2 == NULL)//第二个节点空，第一个节点大
		return -1;//返回-1

	if (hn1->count > hn2->count)//两个节点都不为空，比较count值，1大于2
		return 1;//返回1
	else if (hn1->count < hn2->count)//1小于2
		return -1;//返回-1

	return 0;
}

建立内部节点函数：new_nonleaf_node

/*建立内部节点*/
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
	huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));
	p->isLeaf = 0;//不是叶节点
	p->count = count;
	p->zero = zero;
	p->one = one;
	p->parent = 0;

	return p;
}

遍历Huffman树，生成码字

/*遍历Huffman树，生成码字*/
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
	if (subtree == NULL)//树为空则返回
		return;

	if (subtree->isLeaf)//是叶节点，则进行编码
		(*pSF)[subtree->symbol] = new_code(subtree);//
	else//不是叶节点，则先走左节点，再走右节点
	{
		build_symbol_encoder(subtree->zero, pSF);//递归
		build_symbol_encoder(subtree->one, pSF);
	}
}

新建码字函数：new_code

/*新建码字*/
static huffman_code*
new_code(const huffman_node* leaf)
{
	/* Build the huffman code by walking up to
	* the root node and then reversing the bits,
	* since the Huffman code is calculated by
	* walking down the tree. */
	unsigned long numbits = 0;//码长
	unsigned char* bits = NULL;//存储码字的数组
	huffman_code *p;

	while (leaf && leaf->parent)//从下到上，不是根节点时进入循环
	{
		huffman_node *parent = leaf->parent;//得到码字位置和码字的字节数
		unsigned char cur_bit = (unsigned char)(numbits % 8);
		unsigned long cur_byte = numbits / 8;

		/* If we need another byte to hold the code,
		then allocate it. */
		if (cur_bit == 0)
		{
			size_t newSize = cur_byte + 1;
			bits = (char*)realloc(bits, newSize);//新建字节保存高位的码字
			bits[newSize - 1] = 0;//新增字节初始化为0
		}

		/* If a one must be added then or it in. If a zero
		* must be added then do nothing, since the byte
		* was initialized to zero. */
		if (leaf == parent->one)//是否是右节点
			bits[cur_byte] |= 1 << cur_bit;//将当前字节设为1

		++numbits;//码长+1
		leaf = parent;//将下一个节点挪到父节点
	}

	if (bits)
		reverse_bits(bits, numbits);//码字逆序

	p = (huffman_code*)malloc(sizeof(huffman_code));
	p->numbits = numbits;
	p->bits = bits;
	return p;
}

码字逆序函数：reverse_bits

/*码字逆序*/
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
	unsigned long numbytes = numbytes_from_numbits(numbits);//码字占用字节数
	unsigned char *tmp =
		(unsigned char*)alloca(numbytes);//开辟存储空间
	unsigned long curbit;
	long curbyte = 0;

	memset(tmp, 0, numbytes);

	for (curbit = 0; curbit < numbits; ++curbit)
	{
		unsigned int bitpos = curbit % 8;

		if (curbit > 0 && curbit % 8 == 0)//判断当前位在字节中的位数，到下一字节则字节数+1
			++curbyte;

		tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos);//从后往前取每一位再移位
	}

	memcpy(bits, tmp, numbytes);
}

判断字节数函数：numbytes_from_numbits

static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
	return numbits / 8 + (numbits % 8 ? 1 : 0);//确定字节数
}

get_bit返回位数返回值的第 i/8 个字节的第 i%8 位

static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
	return (bits[i / 8] >> i % 8) & 1;//i/8取整，i%8取余，表示第几字节第几位
}

（4）将码表及其他必要信息写入输出文件

/*写入码表*/
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
	unsigned long i, count = 0;

	/* Determine the number of entries in se. */
	for (i = 0; i < MAX_SYMBOLS; ++i)//统计码字种类
	{
		if ((*se)[i])
			++count;
	}

	/* Write the number of entries in network byte order. */
	i = htonl(count);    //在网络传输中，采用big-endian序，对于0x0A0B0C0D ，传输顺序就是0A 0B 0C 0D ，
	//因此big-endian作为network byte order，little-endian作为host byte order。
	//little-endian的优势在于unsigned char/short/int/long类型转换时，存储位置无需改变
	if (fwrite(&i, sizeof(i), 1, out) != 1)
		return 1;

	/* Write the number of bytes that will be encoded. */
	symbol_count = htonl(symbol_count);
	if (fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
		return 1;

	/* Write the entries. */
	for (i = 0; i < MAX_SYMBOLS; ++i)//写入码表
	{
		huffman_code *p = (*se)[i];
		if (p)
		{
			unsigned int numbytes;
			/* Write the 1 byte symbol. */
			fputc((unsigned char)i, out);//写入字节符号
			/* Write the 1 byte code bit length. */
			fputc(p->numbits, out);//写入码长
			/* Write the code bytes. */
			numbytes = numbytes_from_numbits(p->numbits);//写入码字
			if (fwrite(p->bits, 1, numbytes, out) != numbytes)
				return 1;
		}
	}

	return 0;
}

（5）第二次扫描：对源文件进行编码并输出

/*第二次扫描，对文件进行编码*/
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
	unsigned char curbyte = 0;
	unsigned char curbit = 0;
	int c;

	while ((c = fgetc(in)) != EOF)
	{
		unsigned char uc = (unsigned char)c;
		huffman_code *code = (*se)[uc];//查表
		unsigned long i;

		for (i = 0; i < code->numbits; ++i)//将码字写入文件
		{
			/* Add the current bit to curbyte. */
			curbyte |= get_bit(code->bits, i) << curbit;//把当前比特位加到编码字节的相应位置  

			/* If this byte is filled up then write it
			* out and reset the curbit and curbyte. */
			if (++curbit == 8)
			{
				fputc(curbyte, out);
				curbyte = 0;
				curbit = 0;
			}
		}
	}

2.Huffman解码流程

（1）读入解码文件，提取必要信息，依照码表重建Huffman树

/*对文件解码，读取码表*/
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{
	huffman_node *root = new_nonleaf_node(0, NULL, NULL);
	unsigned int count;

	/* Read the number of entries.
	(it is stored in network byte order). */
	if (fread(&count, sizeof(count), 1, in) != 1)//读取码表中的符号数
	{
		free_huffman_tree(root);
		return NULL;
	}

	count = ntohl(count);

	/* Read the number of data bytes this encoding represents. */
	if (fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)
	{
		free_huffman_tree(root);
		return NULL;
	}

	*pDataBytes = ntohl(*pDataBytes);


	/* Read the entries. */
	while (count-- > 0)//读取码表：符号，码长，码字
	{
		int c;
		unsigned int curbit;
		unsigned char symbol;
		unsigned char numbits;
		unsigned char numbytes;
		unsigned char *bytes;
		huffman_node *p = root;

		if ((c = fgetc(in)) == EOF)//一个字节一个字节地读入
		{
			free_huffman_tree(root);
			return NULL;
		}
		symbol = (unsigned char)c;

		if ((c = fgetc(in)) == EOF)
		{
			free_huffman_tree(root);
			return NULL;
		}

		numbits = (unsigned char)c;
		numbytes = (unsigned char)numbytes_from_numbits(numbits);//计算存储一个码长需要多少字节
		bytes = (unsigned char*)malloc(numbytes);//开辟空间
		if (fread(bytes, 1, numbytes, in) != numbytes)//读取码字
		{
			free(bytes);
			free_huffman_tree(root);
			return NULL;
		}

		/*
		* Add the entry to the Huffman tree. The value
		* of the current bit is used switch between
		* zero and one child nodes in the tree. New nodes
		* are added as needed in the tree.
		*/
		for (curbit = 0; curbit < numbits; ++curbit)//根据码表建立码树
		{
			if (get_bit(bytes, curbit))//当前位为1建立右节点
			{
				if (p->one == NULL)
				{
					//如果是最后一位，则建立树叶节点,不是则建立非叶节点
					p->one = curbit == (unsigned char)(numbits - 1)
						? new_leaf_node(symbol)
						: new_nonleaf_node(0, NULL, NULL);
					p->one->parent = p;
				}
				p = p->one;
			}
			else//否则建立左节点
			{
				if (p->zero == NULL)
				{
					p->zero = curbit == (unsigned char)(numbits - 1)
						? new_leaf_node(symbol)
						: new_nonleaf_node(0, NULL, NULL);
					p->zero->parent = p;
				}
				p = p->zero;
			}
		}

		free(bytes);
	}

	return root;//返回根节点
}

（2）根据Huffman树进行解码

/*解码过程*/
int
huffman_decode_file(FILE *in, FILE *out)
{
	huffman_node *root, *p;
	int c;
	unsigned int data_count;

	/* Read the Huffman code table. */
	root = read_code_table(in, &data_count);//读取码表
	if (!root)
		return 1;

	/*文件解码 */
	p = root;
	while (data_count > 0 && (c = fgetc(in)) != EOF)
	{
		unsigned char byte = (unsigned char)c;
		unsigned char mask = 1;//逐位读码字
		while (data_count > 0 && mask)
		{
			p = byte & mask ? p->one : p->zero;
			mask <<= 1;//左移1位

			if (p->isLeaf)
			{
				fputc(p->symbol, out);//输出叶节点存储符号
				p = root;//转到根节点
				--data_count;//没解码符号数-1
			}
		}
	}

	free_huffman_tree(root);
	return 0;
}

三、实验结果

（1）表格形式

数据压缩实验三：Huffman编解码

注意：计算平均码长时不是仅仅算码长的平均值，而要用概率*码长再求和。

由上表可以发现，Huffman编码的平均码长略大于信源熵但很接近。

（2）各样本文件的概率分布图

数据压缩实验三：Huffman编解码

由概率分布图和表格比较得出，概率分布越均匀（如例pptx文件），压缩比越低，压缩效果越差。

上一篇： Netty——编解码器

下一篇：【机器人学】机器人开源项目KDL源码学习：（4）机械臂逆动力学的牛顿欧拉算法

数据压缩实验三：Huffman编解码

一、基本原理

1.Huffman编码算法

2.Huffman编码的数据结构设计

二、实验流程及代码分析

1.Huffman编码流程

2.Huffman解码流程

三、实验结果

（1）表格形式

（2）各样本文件的概率分布图