数据压缩实验三

程序员文章站 2022-07-14 21:58:17

...

实验原理

一、熵

在信息论中,熵是信息的度量单位。信息论的创始人shannon在其著作《通信的数学理论》中提出了建立在概率统计模型上的信息度量。他把信息定义为“用来消除不确定性的东西”。一般用符号 H 表示,单位是比特。变量的不确定性越大,熵也就越大。换句话说,了解它所需要的信息量也就越大。 Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman 编码是可变字长编码(VLC)的一种。Huffman 编码基于信源的概率统计模型,它的基本思路是,出现概率大的信源符号编长码,出现概率小的信源符号编短码,从而使平均码长最小。在程序实现中常使用一种叫做树的数据结构实现 Huffman 编码,由它编出的码是即时码。

二、 Huffman 编码

Huffman Coding (霍夫曼编码)是一种无失真编码的编码方式,Huffman 编码是可变字长编码(VLC)的一种。 Huffman 编码基于信源的概率统计模型,它的基本思路是,出现概率大的信源符号编长码,出现概率小的信源符号编短码,从而使平均码长最小。在程序实现中常使用一种叫做树的数据结构实现 Huffman 编码,由它编出的码是即时码。

实验流程

数据压缩实验三

1、统计符号的发生概率;

2、把频率按从小到大的顺序排列

3、每一次选出最小的两个值,作为二叉树的两个叶子节点,将和作为它们的根节点,这两个叶子节点不再参与比较,新的根节点参与比较;

4、重复 3,直到最后得到和为 1 的根节点;

5、将形成的二叉树的左节点标 0,右节点标 1,把从最上面的根节点到最下面的叶子节点途中遇到的 0,1 序列串起来,就得到了各个符号的编码。

关键代码及其分析

huff code.c

//定义数据类型
//节点
typedef struct huffman_node_tag
{
	unsigned char isLeaf;//判断是否是叶节点
	unsigned long count;//出现次数
	struct huffman_node_tag *parent;


	union
	{
		struct
		{
			struct huffman_node_tag *zero, *one;//左0右1
		};
		unsigned char symbol;
	};
} huffman_node;
//码字数据类型
typedef struct huffman_code_tag
{
	/* The length of this code in bits. */
	unsigned long numbits;//长度


	/* The bits that make up this code. The first
	   bit is at position 0 in bits[0]. The second
	   bit is at position 1 in bits[0]. The eighth
	   bit is at position 7 in bits[0]. The ninth
	   bit is at position 0 in bits[1]. */
	unsigned char *bits;
} huffman_code;

static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
	return numbits / 8 + (numbits % 8 ? 1 : 0);//确定字节数
}


/*
* get_bit returns the ith bit in the bits array
* in the 0th position of the return value.
*/
static unsigned char
get_bit(unsigned char* bits, unsigned long i)
{
	return (bits[i / 8] >> i % 8) & 1;
	//i/8取整，i%8取余，表示第几字节第几位
}
//码字逆序
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
	unsigned long numbytes = numbytes_from_numbits(numbits);//码字占用字节数
	unsigned char *tmp =
		(unsigned char*)alloca(numbytes);//开辟空间
	unsigned long curbit;
	long curbyte = 0;


	memset(tmp, 0, numbytes);


	for (curbit = 0; curbit < numbits; ++curbit)
	{
		unsigned int bitpos = curbit % 8;


		if (curbit > 0 && curbit % 8 == 0)//判断当前位在字节中的位数
			++curbyte;//到下一字节


		tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos);//从后往前取每一位再移位
	}


	memcpy(bits, tmp, numbytes);
}



//新建码字
static huffman_code*
new_code(const huffman_node* leaf)
{
	/* Build the huffman code by walking up to
	* the root node and then reversing the bits,
	* since the Huffman code is calculated by
	* walking down the tree. */
	unsigned long numbits = 0;//码长
	unsigned char* bits = NULL;//存储码字的数组
	huffman_code *p;
	while (leaf && leaf->parent)//不是根节点
	{
		huffman_node *parent = leaf->parent;//得到码字位置和码字的字节数
		unsigned char cur_bit = (unsigned char)(numbits % 8);
		unsigned long cur_byte = numbits / 8;


		/* If we need another byte to hold the code,
		then allocate it. */
		if (cur_bit == 0)
		{
			size_t newSize = cur_byte + 1;
			bits = (char*)realloc(bits, newSize);//新建字节保存高位的码字
			bits[newSize - 1] = 0;//新增字节初始化为0
		}


		if (leaf == parent->one)//是否是右节点
			bits[cur_byte] |= 1 << cur_bit;//将当前字节设为1


		++numbits;//码长++
		leaf = parent;//将下一个节点挪到父节点
	}


	if (bits)
		reverse_bits(bits, numbits);//码字逆序


	p = (huffman_code*)malloc(sizeof(huffman_code));
	p->numbits = numbits;
	p->bits = bits;
	return p;
}


#define MAX_SYMBOLS 256
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];//信源符号数组
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];//编码后符号数组


//新建结点
static huffman_node*
new_leaf_node(unsigned char symbol)
{
	huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));//开辟空间
	p->isLeaf = 1;
	//为叶结点
	p->symbol = symbol;
	p->count = 0;
	p->parent = 0;
	return p;
}
//建立内部节点
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
	huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));
	p->isLeaf = 0;//不是叶节点
	p->count = count;
	p->zero = zero;
	p->one = one;
	p->parent = 0;


	return p;
}


//统计信源符号概率
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
	int c;
	unsigned int total_count = 0;
	/* Set all frequencies to 0. */
	init_frequencies(pSF);//初始化频率为0




	while ((c = fgetc(in)) != EOF)//读取文件中每个信源符号
	{
		unsigned char uc = c;
		if (!(*pSF)[uc])
			(*pSF)[uc] = new_leaf_node(uc);//若没有此符号，建立新节点
		++(*pSF)[uc]->count;//累计发生次数
		++total_count;//累计信源发生次数
	}


	return total_count;
}

//将节点按出现概率从小到大排序，qsort函数中用到
static int
SFComp(const void *p1, const void *p2)
{
	const huffman_node *hn1 = *(const huffman_node**)p1;
	const huffman_node *hn2 = *(const huffman_node**)p2;
	//将所有空节点排序 
	if (hn1 == NULL && hn2 == NULL) //两个节点都为空
		return 0;//返回0
	if (hn1 == NULL)//第一个节点为空，第二个节点大
		return 1;//返回1
	if (hn2 == NULL)//第二个节点空，第一个节点大
		return -1;//返回-1


	if (hn1->count > hn2->count)//两个节点都不为空，比较count值，1大于2
		return 1;//返回1
	else if (hn1->count < hn2->count)//1小于2
		return -1;//返回-1


	return 0;
}
/*
* build_symbol_encoder builds a SymbolEncoder by walking
* down to the leaves of the Huffman tree and then,
* for each leaf, determines its code.
*/
//生成码字
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
	if (subtree == NULL)//树为空则返回
		return;


	if (subtree->isLeaf)//叶节点进行编码
		(*pSF)[subtree->symbol] = new_code(subtree);
	else//不是叶节点先走左节点
	{
		build_symbol_encoder(subtree->zero, pSF);
		build_symbol_encoder(subtree->one, pSF);
	}
}
/*
* calculate_huffman_codes turns pSF into an array
* with a single entry that is the root of the
* huffman tree. The return value is a SymbolEncoder,
* which is an array of huffman codes index by symbol value.
*/
//建立Huffman树
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
	unsigned int i = 0;
	unsigned int n = 0;
	huffman_node *m1 = NULL, *m2 = NULL;
	SymbolEncoder *pSE = NULL;


#if 1
	printf("BEFORE SORT\n");
	print_freqs(pSF);
#endif
	qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);//使用qshort函数将信源符号按出现频率大小排序，下标小的在前


#if 1	
	printf("AFTER SORT\n");
	print_freqs(pSF);
#endif
	for (n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)//统计种类总数，一个字节8bit，共256种
		;


	/*
	* Construct a Huffman tree. This code is based
	* on the algorithm given in Managing Gigabytes
	* by Ian Witten et al, 2nd edition, page 34.
	* Note that this implementation uses a simple
	* count instead of probability.
	*/
	for (i = 0; i < n - 1; ++i)
	{
		m1 = (*pSF)[0];
		m2 = (*pSF)[1];//将出现次数最少的两个节点设为m1,m2


		/* Replace m1 and m2 with a set {m1, m2} whose probability
		* is the sum of that of m1 and m2. */
		(*pSF)[0] = m1->parent = m2->parent =
			new_nonleaf_node(m1->count + m2->count, m1, m2);
		(*pSF)[1] = NULL;
		qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);//重新排序
	}


	/* Build the SymbolEncoder array from the tree. */
	pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
	memset(pSE, 0, sizeof(SymbolEncoder));
	build_symbol_encoder((*pSF)[0], pSE);//从树根开始构建码字
	return pSE;
}
/*写入码表*/
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
	unsigned long i, count = 0;


	/* Determine the number of entries in se. */
	for (i = 0; i < MAX_SYMBOLS; ++i)//统计码字种类
	{
		if ((*se)[i])
			++count;
	}


	/* Write the number of entries in network byte order. */
	i = htonl(count);    //在网络传输中，采用big-endian序，对于0x0A0B0C0D ，传输顺序就是0A 0B 0C 0D ，
	//因此big-endian作为network byte order，little-endian作为host byte order。
	//little-endian的优势在于unsigned char/short/int/long类型转换时，存储位置无需改变
	if (fwrite(&i, sizeof(i), 1, out) != 1)
		return 1;


	/* Write the number of bytes that will be encoded. */
	symbol_count = htonl(symbol_count);
	if (fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
		return 1;


	/* Write the entries. */
	for (i = 0; i < MAX_SYMBOLS; ++i)//写入码表
	{
		huffman_code *p = (*se)[i];
		if (p)
		{
			unsigned int numbytes;
			/* Write the 1 byte symbol. */
			fputc((unsigned char)i, out);//写入字节符号
			/* Write the 1 byte code bit length. */
			fputc(p->numbits, out);//写入码长
			/* Write the code bytes. */
			numbytes = numbytes_from_numbits(p->numbits);//写入码字
			if (fwrite(p->bits, 1, numbytes, out) != numbytes)
				return 1;
		}
	}


	return 0;
}
//对文件解码，读取码表
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{
	huffman_node *root = new_nonleaf_node(0, NULL, NULL);
	unsigned int count;


	/* Read the number of entries.
	(it is stored in network byte order). */
	if (fread(&count, sizeof(count), 1, in) != 1)//读取码表中的符号数
	{
		free_huffman_tree(root);
		return NULL;
	}


	count = ntohl(count);


	/* Read the number of data bytes this encoding represents. */
	if (fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)
	{
		free_huffman_tree(root);
		return NULL;
	}


	*pDataBytes = ntohl(*pDataBytes);




	/* Read the entries. */
	while (count-- > 0)//读取码表
	{
		int c;
		unsigned int curbit;
		unsigned char symbol;
		unsigned char numbits;
		unsigned char numbytes;
		unsigned char *bytes;
		huffman_node *p = root;


		if ((c = fgetc(in)) == EOF)//读入
		{
			free_huffman_tree(root);
			return NULL;
		}
		symbol = (unsigned char)c;


		if ((c = fgetc(in)) == EOF)
		{
			free_huffman_tree(root);
			return NULL;
		}


		numbits = (unsigned char)c;
		numbytes = (unsigned char)numbytes_from_numbits(numbits);//计算存储一个码长需要多少字节
		bytes = (unsigned char*)malloc(numbytes);//开辟空间
		if (fread(bytes, 1, numbytes, in) != numbytes)//读取码字
		{
			free(bytes);
			free_huffman_tree(root);
			return NULL;
		}
/*
		* Add the entry to the Huffman tree. The value
		* of the current bit is used switch between
		* zero and one child nodes in the tree. New nodes
		* are added as needed in the tree.
		*/
		for (curbit = 0; curbit < numbits; ++curbit)
		{
			if (get_bit(bytes, curbit))//为1建右节点
			{
				if (p->one == NULL)
				{
					p->one = curbit == (unsigned char)(numbits - 1)
						? new_leaf_node(symbol)
						: new_nonleaf_node(0, NULL, NULL);
					p->one->parent = p;
					//如果是最后一位，则建立树叶节点,不是则建立非叶节点

				}
				p = p->one;
			}
			else//建立左节点
			{
				if (p->zero == NULL)
				{
					p->zero = curbit == (unsigned char)(numbits - 1)
						? new_leaf_node(symbol)
						: new_nonleaf_node(0, NULL, NULL);
					p->zero->parent = p;
				}
				p = p->zero;
			}
		}


		free(bytes);
	}


	return root;//返回根节点
}
//解码
int
huffman_decode_file(FILE *in, FILE *out)
{
	huffman_node *root, *p;
	int c;
	unsigned int data_count;


	root = read_code_table(in, &data_count);//读取码表
	if (!root)
		return 1;

	p = root;
	while (data_count > 0 && (c = fgetc(in)) != EOF)
	{
		unsigned char byte = (unsigned char)c;
		unsigned char mask = 1;//逐位读码字
		while (data_count > 0 && mask)
		{
			p = byte & mask ? p->one : p->zero;
			mask <<= 1;//左移1位


			if (p->isLeaf)
			{
				fputc(p->symbol, out);//输出叶节点存储符号
				p = root;//转到根节点
				--data_count;//没解码符号数-——
			}
		}
	}


	free_huffman_tree(root);
	return 0;
｝




//huffman.c


main(int argc, char** argv)
{
	char memory = 0;//1对内存数据进行操作，0不操作
	char compress = 1;//1解码，0编码
	int opt;
	const char *file_in = NULL, *file_out = NULL;
	//step1:add by yzhang for huffman statistics
	const char *file_out_table = NULL;
	//end by yzhang
	FILE *in = stdin;
	FILE *out = stdout;
	//step1:add by yzhang for huffman statistics
	FILE * outTable = NULL;
	//end by yzhang


	/* Get the command line arguments. */
	while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1) //读取命令行参数
	{
		switch(opt)
		{
		case 'i':
			file_in = optarg;//输入文件
			break;
		case 'o':
			file_out = optarg;//输出文件
			break;
		case 'c':
			compress = 1;//编码
			break;
		case 'd':
			compress = 0;//解码
			break;
		case 'h':
			usage(stdout);//输出参数用法的说明
			return 0;
		case 'v':
			version(stdout);//输出版本号的信息
			return 0;
		case 'm':
			memory = 1;//对内存数据进行操作
			break;
		// by yzhang for huffman statistics
		case 't':
			file_out_table = optarg;//输出中间数据信息			
			break;
		//end by yzhang
		default:
			usage(stderr);
			return 1;
		}
	}


	if(file_in)//输入文件
	{
		in = fopen(file_in, "rb");
		if(!in)
		{
			fprintf(stderr,
					"Can't open input file '%s': %s\n",
					file_in, strerror(errno));
			return 1;
		}
	}


	/* If an output file is given then create it. */
	if(file_out)//创建输出文件
	{
		out = fopen(file_out, "wb");
		if(!out)
		{
			fprintf(stderr,
					"Can't open output file '%s': %s\n",
					file_out, strerror(errno));
			return 1;
		}
	}


	。。。

}

实验结果




观察可得平均码长和熵的值很接近，与此同时如果文件过小，则很有可能出现huffman编码之后文件变大的情况，由于还有码表的存在，很占空间。

上一篇：隐藏数据的好方法----Alternative Data Stream(可选数据流)

下一篇：数据压缩试验三：LZW 编码原理与C语言实现

数据压缩实验三

实验原理

一、熵

实验流程

android图像绘制(三)画布刷屏问题记录

Android仿微信调用第三方地图应用导航（高德、百度、腾讯）

CDR快速绘制漂亮的立体三维线条

世界十大最强昆虫蝗虫上榜，第三能自己转化电能

CorelDRAW中将三个或多个对象的对齐方式介绍

教您怎么用三步轻松完成软文推广

Android第三方登录之QQ登录

浅谈Android 中图片的三级缓存策略

python自定义函数实现一个数的三次方计算方法

企业营销型网站做不好可能在这三方面出现问题

数据压缩实验三

实验原理

一、熵

实验流程

android图像绘制(三)画布刷屏问题记录

Android仿微信调用第三方地图应用导航（高德、百度、腾讯）

CDR快速绘制漂亮的立体三维线条

世界十大最强昆虫 蝗虫上榜，第三能自己转化电能

CorelDRAW中将三个或多个对象的对齐方式介绍

教您怎么用三步轻松完成软文推广

Android第三方登录之QQ登录

浅谈Android 中图片的三级缓存策略

python自定义函数实现一个数的三次方计算方法

企业营销型网站做不好 可能在这三方面出现问题

世界十大最强昆虫蝗虫上榜，第三能自己转化电能

企业营销型网站做不好可能在这三方面出现问题