数据压缩 实验三 Huffman编解码算法实现与压缩效率分析
程序员文章站
2022-07-14 22:16:54
...
实验原理
Huffman编码是一种无失真编码方式,也是一种可变长编码:它为出现概率大的信源符号分配较短的码字,为出现概率小的信源符号分配较长的码字。
编码步骤:
①将文件以ASCII字符流的形式读入,统计每个符号的发生概率
②将所有文件中出现过的字符按照概率从小到大的顺序排列
③每一次选出最小的两个值,作为二叉树的两个子节点,将和作为他们的父节点,这两个子节点不再参与比较,新的父节点参与比较
④重复上一步,直到最后得到和为1的根节点
⑤将形成的二叉树的左节点标0,右节点标1,把从最上面的根节点到最下面的树叶节点途中遇到的0和1按序串联,即为该字符的编码。
流程分析:
代码分析
实验中将实际完成编码工作的工程Huff_code封装成一个静态链接库,由工程huff_run来调用,huff_run完成的工作包括解析命令行参数,打开、读取、关闭输入文件,打开关闭输出文件,调用Huff_code完成编码。
Huff_run
huffcode.c
#include "huffman.h"
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <assert.h>
#ifdef WIN32
#include <malloc.h>
extern int getopt(int, char**, char*);
extern char* optarg;
#else
#include <unistd.h>
#endif
static int memory_encode_file(FILE *in, FILE *out);
static int memory_decode_file(FILE *in, FILE *out);
/*
 * Print the program name/version line followed by the copyright line
 * to the given stream.
 */
static void version(FILE *out)
{
    static const char banner[] = "huffcode 0.3\n";
    static const char copyright[] =
        "Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc\n";

    fputs(banner, out);
    fputs(copyright, out);
}
/*
 * Print the command-line help to the given stream.
 *
 * The text mirrors the option string "i:o:cdhvmt:" parsed in main():
 * -i, -o and -t take an argument, the remaining options are flags.
 */
static void usage(FILE* out)
{
    fputs("Usage: huffcode [-i<input file>] [-o<output file>] [-d|-c] [-m] [-t<table file>]\n"
          "-i - input file (default is standard input)\n"
          "-o - output file (default is standard output)\n"
          "-d - decompress\n"
          "-c - compress (default)\n"
          "-m - read file into memory, compress, then write to file (not default)\n"
          "-t - write huffman statistics to the given file\n"
          "-h - show this help\n"
          "-v - show version\n",
          out);
}
/*
 * Entry point of the huffcode command-line tool.
 *
 * Parses the options, opens the requested input/output/statistics files
 * (standard input/output are used when -i/-o are absent), runs the
 * selected operation and closes the streams.
 *
 * Returns 0 on success and 1 on any failure: bad option, a file that
 * cannot be opened, a failed encode/decode, or a failed close of an
 * output stream.
 */
int main(int argc, char** argv)
{
    char memory = 0;
    char compress = 1;
    int opt;
    int rc;
    const char *file_in = NULL, *file_out = NULL;
    const char *file_out_table = NULL;
    FILE *in = stdin;
    FILE *out = stdout;
    FILE *outTable = NULL;

    /* Get the command line arguments. */
    while((opt = getopt(argc, argv, "i:o:cdhvmt:")) != -1)
    {
        switch(opt)
        {
        case 'i':
            file_in = optarg;
            break;
        case 'o':
            file_out = optarg;
            break;
        case 'c':
            compress = 1;
            break;
        case 'd':
            compress = 0;
            break;
        case 'h':
            usage(stdout);
            return 0;
        case 'v':
            version(stdout);
            return 0;
        case 'm':
            memory = 1;
            break;
        case 't':
            file_out_table = optarg;
            break;
        default:
            usage(stderr);
            return 1;
        }
    }

    /* If an input file is given then open it. */
    if(file_in)
    {
        in = fopen(file_in, "rb");
        if(!in)
        {
            fprintf(stderr,
                    "Can't open input file '%s': %s\n",
                    file_in, strerror(errno));
            return 1;
        }
    }

    /* If an output file is given then create it. */
    if(file_out)
    {
        out = fopen(file_out, "wb");
        if(!out)
        {
            fprintf(stderr,
                    "Can't open output file '%s': %s\n",
                    file_out, strerror(errno));
            fclose(in);
            return 1;
        }
    }

    /* If a statistics file is given then create it (text mode). */
    if(file_out_table)
    {
        outTable = fopen(file_out_table, "w");
        if(!outTable)
        {
            fprintf(stderr,
                    "Can't open output file '%s': %s\n",
                    file_out_table, strerror(errno));
            fclose(in);
            fclose(out);
            return 1;
        }
    }

    /* Run the requested operation and keep its status. */
    if(memory)
        rc = compress ? memory_encode_file(in, out)
                      : memory_decode_file(in, out);
    else if(compress)
        rc = huffman_encode_file(in, out, outTable);
    else
        rc = huffman_decode_file(in, out);

    /* Close every stream on every path; a failed close of an output
     * stream means buffered data was lost, so report it as an error. */
    fclose(in);
    if(fclose(out) != 0)
        rc = 1;
    if(outTable && fclose(outTable) != 0)
        rc = 1;

    return rc ? 1 : 0;
}
/*
 * Read the whole of `in` into memory, Huffman-encode it with
 * huffman_encode_memory() and write the result to `out`.
 *
 * Returns 0 on success, 1 on allocation failure, read error, encode
 * failure or short write. Neither stream is closed here.
 */
static int memory_encode_file(FILE *in, FILE *out)
{
    unsigned char *buf = NULL, *bufout = NULL;
    unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;
    int rc = 1;

    assert(in && out);

    /* Read the file into memory, growing the buffer by `inc` bytes at a
     * time. Stop on a read error as well as on end of file, otherwise a
     * failing stream would make this loop spin until memory runs out. */
    while(!feof(in) && !ferror(in))
    {
        unsigned char *tmp;

        /* Refuse to let the 32-bit length wrap around. */
        if(len > (unsigned int)-1 - inc)
            goto done;
        len += inc;

        tmp = (unsigned char*)realloc(buf, len);
        if(!tmp)
            goto done;
        buf = tmp;

        cur += (unsigned int)fread(buf + cur, 1, inc, in);
    }

    if(!buf || ferror(in))
        goto done;

    /* Encode the memory. */
    if(huffman_encode_memory(buf, cur, &bufout, &bufoutlen))
        goto done;

    /* Write the memory to the file. */
    if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen)
        goto done;

    rc = 0;

done:
    /* free(NULL) is a no-op, so both buffers can be released
     * unconditionally, including a partial output buffer. */
    free(buf);
    free(bufout);
    return rc;
}
/*
 * Read the whole of `in` into memory, Huffman-decode it with
 * huffman_decode_memory() and write the result to `out`.
 *
 * Returns 0 on success, 1 on allocation failure, read error, decode
 * failure or short write. Neither stream is closed here.
 */
static int memory_decode_file(FILE *in, FILE *out)
{
    unsigned char *buf = NULL, *bufout = NULL;
    unsigned int len = 0, cur = 0, inc = 1024, bufoutlen = 0;
    int rc = 1;

    assert(in && out);

    /* Read the file into memory, growing the buffer by `inc` bytes at a
     * time. Stop on a read error as well as on end of file, otherwise a
     * failing stream would make this loop spin until memory runs out. */
    while(!feof(in) && !ferror(in))
    {
        unsigned char *tmp;

        /* Refuse to let the 32-bit length wrap around. */
        if(len > (unsigned int)-1 - inc)
            goto done;
        len += inc;

        tmp = (unsigned char*)realloc(buf, len);
        if(!tmp)
            goto done;
        buf = tmp;

        cur += (unsigned int)fread(buf + cur, 1, inc, in);
    }

    if(!buf || ferror(in))
        goto done;

    /* Decode the memory. */
    if(huffman_decode_memory(buf, cur, &bufout, &bufoutlen))
        goto done;

    /* Write the memory to the file. */
    if(fwrite(bufout, 1, bufoutlen, out) != bufoutlen)
        goto done;

    rc = 0;

done:
    /* free(NULL) is a no-op, so both buffers can be released
     * unconditionally. */
    free(buf);
    free(bufout);
    return rc;
}
getopt.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* declarations to provide consistent linkage */
extern char *optarg;
extern int optind;
extern int opterr;
/* Parser state shared with the caller of getopt(). */
int opterr = 1, /* if error message should be printed */
optind = 1, /* index into parent argv vector */
optopt, /* character checked for validity */
optreset; /* reset getopt */
char *optarg; /* argument associated with option */
/* Return values for an unknown option and for a missing argument. */
#define BADCH (int)'?'
#define BADARG (int)':'
/* Empty string used to mark "no option cluster is being scanned". */
#define EMSG ""
/*
* getopt --
* Parse argc/argv argument vector.
*
* nargc/nargv: the argument vector; scanning starts at nargv[optind].
* ostr: the accepted option letters; a letter followed by ':' takes an
* argument.
*
* Returns the option letter found (also stored in optopt), EOF when
* nargv[optind] is past the end, does not start with '-', or is "--"
* (which is skipped), BADCH for an unknown option, and for a missing
* argument either BADARG (when ostr starts with ':') or BADCH.
* For options that take an argument, optarg points at it; otherwise
* optarg is set to NULL.
*/
int
getopt(int nargc, char * const *nargv, const char* ostr)
{
/* Position inside the argv element currently being scanned; kept
 * between calls so that clustered flags ("-cd") are handled one letter
 * per call. */
static char *place = EMSG; /* option letter processing */
char *oli; /* option letter list index */
if (optreset || !*place) { /* update scanning pointer */
optreset = 0;
/* No more arguments, or the next one is not an option. */
if (optind >= nargc || *(place = nargv[optind]) != '-') {
place = EMSG;
return (EOF);
}
if (place[1] && *++place == '-') { /* found "--" */
++optind;
place = EMSG;
return (EOF);
}
} /* option letter okay? */
/* ':' is never a valid option letter; anything absent from ostr is
 * rejected as well. */
if ((optopt = (int)*place++) == (int)':' ||
!(oli = strchr(ostr, optopt))) {
/*
* if the user didn't specify '-' as an option,
* assume it means EOF.
*/
if (optopt == (int)'-')
return (EOF);
/* End of this argv element: move on to the next one. */
if (!*place)
++optind;
/* The message is suppressed when ostr starts with ':'. */
if (opterr && *ostr != ':')
(void)fprintf(stderr,
"%s: illegal option -- %c\n", __FILE__, optopt);
return (BADCH);
}
if (*++oli != ':') { /* don't need argument */
optarg = NULL;
if (!*place)
++optind;
}
else { /* need an argument */
if (*place) /* no white space */
optarg = place;
else if (nargc <= ++optind) { /* no arg */
place = EMSG;
if (*ostr == ':')
return (BADARG);
if (opterr)
(void)fprintf(stderr,
"%s: option requires an argument -- %c\n",
__FILE__, optopt);
return (BADCH);
}
else /* white space */
optarg = nargv[optind];
/* The argument consumed the rest of this element (or the next
 * one), so scanning restarts at a fresh argv element. */
place = EMSG;
++optind;
}
return (optopt); /* dump back option letter */
}
Huff_code
huffman.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "huffman.h"
#ifdef WIN32
#include <winsock2.h>
#include <malloc.h>
#define alloca _alloca
#else
#include <netinet/in.h>
#endif
//////////定义节点结构体//////////
/* One node of the Huffman tree: either a leaf carrying a symbol or an
 * internal node carrying two children. */
typedef struct huffman_node_tag
{
unsigned char isLeaf; // non-zero when this node is a leaf
unsigned long count; // number of occurrences in the source (sum of the children for an internal node)
struct huffman_node_tag *parent; // parent node; 0 for a node that has no parent yet (e.g. the root)
union // internal node: pointers to the two children
{ // leaf: the source symbol this leaf stands for
struct
{
struct huffman_node_tag *zero, *one;
};
unsigned char symbol;
};
} huffman_node;
//////////定义码字结构体//////////
/* The code word assigned to one symbol. */
typedef struct huffman_code_tag
{
/* The length of this code in bits. */
unsigned long numbits; // length of the code word, in bits
// Bit i of the code word is stored in bits[i / 8] at bit position i % 8
// (least significant bit first), matching get_bit():
// code bits 0..7 live in bits[0], code bit 8 is the lowest bit of bits[1],
// and so on.
unsigned char *bits; // the code word, packed into bytes
} huffman_code;
//////////定义输出信息结构体//////////
/* Per-symbol statistics collected for the report written by
 * output_huffman_statistics(). All arrays are indexed by byte value. */
typedef struct huffman_statistics_result
{
float freq[256]; // relative frequency of each of the 256 byte values
unsigned long numbits[256]; // code length of each symbol, in bits (0 = symbol has no code)
unsigned char bits[256][100]; // packed code word of each symbol; 100 bytes are reserved per symbol
}huffman_stat;
//////////将不足一字节的内容凑成一字节//////////
/*
 * Number of whole bytes needed to hold `numbits` bits
 * (a partially used byte counts as one byte).
 */
static unsigned long numbytes_from_numbits(unsigned long numbits)
{
    unsigned long whole_bytes = numbits >> 3;

    if(numbits & 7u)
        return whole_bytes + 1;
    return whole_bytes;
}
//////////取出码字中一位//////////
/*
 * Return bit `i` (0-based) of the packed bit array `bits`.
 * Bits are numbered from the least significant bit of bits[0] upwards.
 */
static unsigned char get_bit(unsigned char* bits, unsigned long i)
{
    unsigned char holder = bits[i >> 3];
    unsigned int shift = (unsigned int)(i & 7u);

    return (unsigned char)((holder >> shift) & 1u);
}
//////////将编好的码字反序//////////
/*
 * Reverse the order of the first `numbits` bits of the packed bit array
 * `bits`, in place. Bit i lives in bits[i / 8] at position i % 8.
 *
 * Any padding bits above `numbits` in the last byte are cleared, which
 * is what the previous scratch-buffer implementation produced as well.
 *
 * The reversal is done by swapping bit pairs from both ends, so no
 * temporary buffer is needed. The earlier version called alloca(),
 * which is not declared by any header included on the non-WIN32 path
 * and puts an unbounded allocation on the stack.
 *
 * A NULL pointer or a zero length is a no-op.
 */
static void reverse_bits(unsigned char* bits, unsigned long numbits)
{
    unsigned long lo, hi;

    if(!bits || numbits == 0)
        return;

    for(lo = 0, hi = numbits - 1; lo < hi; ++lo, --hi)
    {
        unsigned char lo_mask = (unsigned char)(1u << (lo % 8));
        unsigned char hi_mask = (unsigned char)(1u << (hi % 8));
        int lo_set = (bits[lo / 8] & lo_mask) != 0;
        int hi_set = (bits[hi / 8] & hi_mask) != 0;

        /* Swapping two single bits only matters when they differ, and
         * in that case it is the same as toggling both. */
        if(lo_set != hi_set)
        {
            bits[lo / 8] ^= lo_mask;
            bits[hi / 8] ^= hi_mask;
        }
    }

    /* Clear the unused high bits of the last byte. */
    if(numbits % 8)
        bits[(numbits - 1) / 8] &= (unsigned char)((1u << (numbits % 8)) - 1u);
}
//////////为树叶代表的信源符号编码//////////
/*
 * Build the code word for the symbol stored in `leaf`.
 *
 * The tree is walked from the leaf up to the root (the node without a
 * parent). Each step appends one bit: 1 when the current node is the
 * `one` child of its parent, 0 otherwise. Since this collects the bits
 * leaf-to-root, the result is reversed at the end so that bit 0 is the
 * first branch taken from the root.
 *
 * Returns a newly allocated huffman_code (release it with free_code()),
 * or NULL if memory runs out. For a node without a parent the code has
 * numbits == 0 and bits == NULL.
 */
static huffman_code*new_code(const huffman_node*leaf)
{
    unsigned long numbits = 0;   /* code length so far */
    unsigned char* bits = NULL;  /* packed code bits */
    huffman_code *p;

    while(leaf && leaf->parent)
    {
        const huffman_node *parent = leaf->parent;
        unsigned char cur_bit = (unsigned char)(numbits % 8); /* position inside the current byte */
        unsigned long cur_byte = numbits / 8;                  /* index of the current byte */

        /* Starting a new byte: grow the buffer by one byte. realloc
         * keeps the bytes written so far; the result goes through a
         * temporary so the old block is not lost on failure. */
        if(cur_bit == 0)
        {
            unsigned char *grown = (unsigned char*)realloc(bits, cur_byte + 1);
            if(!grown)
            {
                free(bits);
                return NULL;
            }
            bits = grown;
            bits[cur_byte] = 0; /* Initialize the new byte. */
        }

        /* A `zero` child needs no work because the byte starts at 0. */
        if(leaf == parent->one)
            bits[cur_byte] |= (unsigned char)(1u << cur_bit);

        ++numbits;
        leaf = parent;
    }

    if(bits)
        reverse_bits(bits, numbits);

    p = (huffman_code*)malloc(sizeof(huffman_code));
    if(!p)
    {
        free(bits);
        return NULL;
    }
    p->numbits = numbits;
    p->bits = bits; /* whole bytes; only the first numbits bits are meaningful */
    return p;
}
/* Number of distinct symbols: one per possible byte value. */
#define MAX_SYMBOLS 256
/* Array of 256 node pointers used while counting symbols and building the tree. */
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];
// Array of 256 huffman_code pointers.
// The index is the byte value of the symbol; the array holds the code table.
//////////建立叶结点 参数为该树叶代表的信源符号//////////
/*
 * Allocate a leaf node for `symbol` with a count of 0 and no parent.
 * The allocation result is not checked.
 */
static huffman_node*new_leaf_node(unsigned char symbol)
{
    huffman_node *leaf = (huffman_node*)malloc(sizeof *leaf);

    leaf->parent = 0;
    leaf->count = 0;
    leaf->symbol = symbol;
    leaf->isLeaf = 1;
    return leaf;
}
//////////新建中间节点 参数为该节点代表的频率及左右子节点//////////
/*
 * Allocate an internal (non-leaf) node with the given count and the
 * given `zero` / `one` children; its parent starts out as 0.
 * The children's parent pointers are not touched here, and the
 * allocation result is not checked.
 */
static huffman_node*new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *inner = (huffman_node*)malloc(sizeof *inner);

    inner->parent = 0;
    inner->one = one;
    inner->zero = zero;
    inner->count = count;
    inner->isLeaf = 0;
    return inner;
}
//////////释放Huffman码树 参数为一个节点//////////
/*
 * Free `subtree` and everything below it. A NULL subtree is a no-op.
 */
static void free_huffman_tree(huffman_node *subtree)
{
    if(!subtree)
        return;

    /* Only internal nodes have children; a leaf stores a symbol in the
     * same union. */
    if(subtree->isLeaf == 0)
    {
        huffman_node *zero_child = subtree->zero;
        huffman_node *one_child = subtree->one;

        free_huffman_tree(zero_child);
        free_huffman_tree(one_child);
    }

    free(subtree);
}
//////////释放Huffman码 参数为一个码指针//////////
/*
 * Free one code word: first its bit buffer, then the structure itself.
 * `p` must not be NULL.
 */
static void free_code(huffman_code* p)
{
    unsigned char *bit_buffer = p->bits;

    free(bit_buffer);
    free(p);
}
//////////释放指向256个节点的指针数组//////////
/*
 * Free a code table: every code word it holds, then the table itself.
 */
static void free_encoder(SymbolEncoder *pSE)
{
    unsigned long sym = 0;

    while(sym < MAX_SYMBOLS)
    {
        huffman_code *code = (*pSE)[sym];

        /* Symbols without a code have an empty slot. */
        if(code != NULL)
            free_code(code);
        ++sym;
    }

    free(pSE);
}
//////////初始化一个含有256节点指针的数组//////////
/*
 * Clear every slot of the node-pointer array by zero-filling it.
 */
static void init_frequencies(SymbolFrequencies *pSF)
{
    memset(pSF, 0, sizeof *pSF);
}
/* Small write buffer placed in front of a growing output buffer. */
typedef struct buf_cache_tag
{
unsigned char *cache; /* staging buffer, allocated by init_cache() */
unsigned int cache_len; /* size of the staging buffer in bytes */
unsigned int cache_cur; /* number of bytes currently held in the staging buffer */
unsigned char **pbufout; /* caller's output buffer pointer; reallocated as data is flushed */
unsigned int *pbufoutlen; /* caller's output length; updated as data is flushed */
} buf_cache;
/*
 * Prepare `pc` as a write cache of `cache_size` bytes in front of the
 * output buffer described by `pbufout` / `pbufoutlen`, and reset that
 * output buffer to NULL / 0.
 *
 * Returns 0 on success, 1 if an output pointer is NULL or the cache
 * could not be allocated.
 */
static int init_cache(buf_cache* pc,
                      unsigned int cache_size,
                      unsigned char **pbufout,
                      unsigned int *pbufoutlen)
{
    assert(pc && pbufout && pbufoutlen);

    if(pbufout == NULL || pbufoutlen == NULL)
        return 1;

    /* The caller's output starts out empty. */
    *pbufout = NULL;
    *pbufoutlen = 0;

    pc->pbufout = pbufout;
    pc->pbufoutlen = pbufoutlen;
    pc->cache_cur = 0;
    pc->cache_len = cache_size;
    pc->cache = (unsigned char*)malloc(cache_size);

    if(pc->cache == NULL)
        return 1;
    return 0;
}
/*
 * Release the staging buffer of `pc` and mark it as released.
 * The caller's output buffer is left alone.
 */
static void free_cache(buf_cache* pc)
{
    assert(pc);

    if(pc->cache == NULL)
        return;

    free(pc->cache);
    pc->cache = NULL;
}
/*
 * Append the bytes held in the staging buffer to the caller's output
 * buffer and empty the staging buffer.
 *
 * Returns 0 on success (including when there was nothing to flush) and
 * 1 if the output buffer could not be grown; in that case nothing is
 * changed.
 */
static int flush_cache(buf_cache* pc)
{
    unsigned int pending;
    unsigned int old_len;
    unsigned char *grown;

    assert(pc);

    pending = pc->cache_cur;
    if(pending == 0)
        return 0;

    old_len = *pc->pbufoutlen;
    grown = realloc(*pc->pbufout, pending + old_len);
    if(grown == NULL)
        return 1;

    memcpy(grown + old_len, pc->cache, pending);
    *pc->pbufout = grown;
    *pc->pbufoutlen = pending + old_len;
    pc->cache_cur = 0;
    return 0;
}
/*
 * Write `to_write_len` bytes from `to_write` through the cache.
 *
 * If the data fits into the free part of the staging buffer it is
 * copied there. Otherwise the staging buffer is flushed first and the
 * data is appended directly to the output buffer, which keeps the
 * output in write order.
 *
 * Returns 0 on success, 1 if the output buffer could not be grown.
 */
static int write_cache(buf_cache* pc,
                       const void *to_write,
                       unsigned int to_write_len)
{
    unsigned char* tmp;

    assert(pc && to_write);
    assert(pc->cache_len >= pc->cache_cur);

    if(to_write_len > pc->cache_len - pc->cache_cur)
    {
        unsigned int newlen;

        /* If the flush fails the cached bytes are still pending, so
         * appending the new data now would put it ahead of them.
         * Report the failure instead. */
        if(flush_cache(pc))
            return 1;

        newlen = *pc->pbufoutlen + to_write_len;
        tmp = realloc(*pc->pbufout, newlen);
        if(!tmp)
            return 1;
        memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);
        *pc->pbufout = tmp;
        *pc->pbufoutlen = newlen;
    }
    else
    {
        /* Write the data to the cache. */
        memcpy(pc->cache + pc->cache_cur, to_write, to_write_len);
        pc->cache_cur += to_write_len;
    }
    return 0;
}
//////////第一次扫描,统计信源字符发生频率//////////
/*
 * First pass over the input: count how often each byte value occurs.
 *
 * Clears `pSF`, then reads `in` to end of file. The first time a byte
 * value is seen a leaf node is created for it in the slot indexed by
 * that value; every occurrence increments the leaf's count.
 *
 * Returns the total number of bytes read.
 */
static unsigned int get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int seen = 0;
    int ch;

    init_frequencies(pSF);

    for(ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        unsigned char sym = ch;
        huffman_node **slot = &(*pSF)[sym];

        if(*slot == NULL)
            *slot = new_leaf_node(sym);

        (*slot)->count += 1;
        seen += 1;
    }

    return seen;
}
/*
 * Memory-buffer counterpart of get_symbol_frequencies(): count how
 * often each byte value occurs in bufin[0 .. bufinlen-1].
 *
 * Clears `pSF`, creates a leaf node the first time a byte value is
 * seen, and increments that leaf's count for every occurrence.
 *
 * Returns the total number of bytes counted.
 */
static unsigned int get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,
                                                       const unsigned char *bufin,
                                                       unsigned int bufinlen)
{
    unsigned int seen = 0;
    const unsigned char *cursor = bufin;
    unsigned int remaining;

    init_frequencies(pSF);

    for(remaining = bufinlen; remaining > 0; --remaining, ++cursor)
    {
        unsigned char sym = *cursor;
        huffman_node **slot = &(*pSF)[sym];

        if(*slot == NULL)
            *slot = new_leaf_node(sym);

        (*slot)->count += 1;
        seen += 1;
    }

    return seen;
}
//////////定义排序的标准 按符号出现次数升序排列//////////
/*
 * qsort() comparator for arrays of huffman_node pointers.
 *
 * Orders nodes by ascending count; NULL entries sort after every real
 * node, and two NULL entries compare equal.
 */
static int SFComp(const void *p1, const void *p2)
{
    /* Each argument points at an array element, i.e. at a node pointer. */
    const huffman_node *lhs = *(const huffman_node* const*)p1;
    const huffman_node *rhs = *(const huffman_node* const*)p2;

    if(lhs == NULL || rhs == NULL)
    {
        if(lhs == rhs)
            return 0;           /* both NULL */
        return lhs ? -1 : 1;    /* the NULL one goes last */
    }

    return (lhs->count > rhs->count) - (lhs->count < rhs->count);
}
#if 1
/*
 * Debug helper: print every slot of the node array to standard output,
 * one line per slot -- "symbol, count" for a node and "NULL" for an
 * empty slot.
 */
static void print_freqs(SymbolFrequencies * pSF)
{
    size_t i;
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*pSF)[i])
            /* count is unsigned long, so it needs %lu rather than %ld. */
            printf("%d, %lu\n", (*pSF)[i]->symbol, (*pSF)[i]->count);
        else
            printf("NULL\n");
    }
}
#endif
//////////为树叶编码,输入树根遍历码数找到树叶进行编码//////////
/*
 * Walk the tree below `subtree` and store a code word for every leaf
 * in the code table `pSF`, at the index given by the leaf's symbol.
 * A NULL subtree is a no-op.
 */
static void build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(!subtree)
        return;

    if(!subtree->isLeaf)
    {
        /* Internal node: visit the zero branch, then the one branch. */
        build_symbol_encoder(subtree->zero, pSF);
        build_symbol_encoder(subtree->one, pSF);
        return;
    }

    /* Leaf: derive its code word from its path to the root. */
    (*pSF)[subtree->symbol] = new_code(subtree);
}
//////////生成码树并编码//////////
/*
 * Build the Huffman tree from the leaves in `pSF` and derive the code
 * table from it.
 *
 * On return (*pSF)[0] is the root of the tree (NULL when no symbol
 * occurred) and the other slots no longer have a meaningful order.
 * The returned table is heap-allocated; release it with free_encoder().
 * Returns NULL if the table cannot be allocated.
 *
 * Note: the "#if 1" sections print the node array to standard output
 * before and after sorting.
 */
static SymbolEncoder* calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL; /* the two least frequent nodes */
    SymbolEncoder *pSE = NULL;

#if 1 /* print the symbols and counts in array order before sorting */
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort by ascending count; empty slots go to the end. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 1 /* print the symbols and counts again after sorting */
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Number of distinct symbols = number of leading non-empty slots. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /* Merge the two least frequent nodes n - 1 times. The condition is
     * written as i + 1 < n because n is unsigned: with n == 0 (empty
     * input) "n - 1" would wrap around and the loop would dereference
     * empty slots. */
    for(i = 0; i + 1 < n; ++i)
    {
        /* Set m1 and m2 to the two subsets of least probability. */
        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        /* Replace m1 and m2 with a set {m1, m2} whose probability
         * is the sum of that of m1 and m2. */
        (*pSF)[0] = m1->parent = m2->parent =
            new_nonleaf_node(m1->count + m2->count, m1, m2);
        (*pSF)[1] = NULL; /* this slot has been merged away */

        /* Put newSet into the correct count position in pSF. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Build the SymbolEncoder array from the tree. */
    pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
    if(!pSE)
        return NULL;
    memset(pSE, 0, sizeof(SymbolEncoder));
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}
//////////把码表写到输出文件//////////
/*
 * Write the code table header and entries to `out`.
 *
 * Layout, as consumed by read_code_table():
 *   - number of table entries, unsigned int, network byte order
 *   - number of encoded data bytes, unsigned int, network byte order
 *   - per entry: 1 byte symbol, 1 byte code length in bits, then the
 *     packed code bytes
 *
 * Returns 0 on success, 1 on any write error.
 */
static int write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
    unsigned long i;
    /* The reader fetches this field with sizeof(unsigned int), so it
     * must be written with the same width. Writing it from an
     * unsigned long emits 8 bytes where long is 64 bits and shifts
     * every field after it. */
    unsigned int count = 0;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. */
    count = htonl(count);
    if(fwrite(&count, sizeof(count), 1, out) != 1)
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;

            /* Write the 1 byte symbol. */
            if(fputc((unsigned char)i, out) == EOF)
                return 1;

            /* Write the 1 byte code bit length. */
            if(fputc((unsigned char)p->numbits, out) == EOF)
                return 1;

            /* Write the code bytes. A zero-length code has no bytes
             * (and a NULL buffer), so skip the write in that case. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(numbytes > 0 && fwrite(p->bits, 1, numbytes, out) != numbytes)
                return 1;
        }
    }
    return 0;
}
/*
 * Memory-buffer counterpart of write_code_table(): write the code
 * table header and entries through the cache `pc`.
 *
 * Layout, as consumed by read_code_table_from_memory():
 *   - number of table entries, unsigned int, network byte order
 *   - number of encoded data bytes, unsigned int, network byte order
 *   - per entry: 1 byte symbol, 1 byte code length in bits, then the
 *     packed code bytes
 *
 * Returns 0 on success, 1 if a write fails.
 */
static int write_code_table_to_memory(buf_cache *pc,
                                      SymbolEncoder *se,
                                      unsigned int symbol_count)
{
    unsigned long i;
    /* Same width as the reader uses (sizeof(unsigned int)); an
     * unsigned long would emit 8 bytes where long is 64 bits. */
    unsigned int count = 0;

    /* Determine the number of entries in se. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        if((*se)[i])
            ++count;
    }

    /* Write the number of entries in network byte order. */
    count = htonl(count);
    if(write_cache(pc, &count, sizeof(count)))
        return 1;

    /* Write the number of bytes that will be encoded. */
    symbol_count = htonl(symbol_count);
    if(write_cache(pc, &symbol_count, sizeof(symbol_count)))
        return 1;

    /* Write the entries. */
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        huffman_code *p = (*se)[i];
        if(p)
        {
            unsigned int numbytes;
            /* The value of i is < MAX_SYMBOLS (256), so it can
               be stored in an unsigned char. */
            unsigned char uc = (unsigned char)i;

            /* Write the 1 byte symbol. */
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;

            /* Write the 1 byte code bit length. */
            uc = (unsigned char)p->numbits;
            if(write_cache(pc, &uc, sizeof(uc)))
                return 1;

            /* Write the code bytes. A zero-length code has a NULL
             * buffer, which write_cache() asserts against, so skip
             * the call when there is nothing to write. */
            numbytes = numbytes_from_numbits(p->numbits);
            if(numbytes > 0 && write_cache(pc, p->bits, numbytes))
                return 1;
        }
    }
    return 0;
}
//////////读取码表并重建据此Huffman树//////////
static huffman_node*read_code_table(FILE* in, unsigned int *pDataBytes)
{
huffman_node *root = new_nonleaf_node(0, NULL, NULL);
unsigned int count;
/* Read the number of entries.
(it is stored in network byte order). */
if(fread(&count, sizeof(count), 1, in) != 1)
//得到码表中的符号数
{
free_huffman_tree(root);
return NULL;
}
count = ntohl(count);
/* Read the number of data bytes this encoding represents. */
if(fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1)
{
free_huffman_tree(root);
return NULL;
}
*pDataBytes = ntohl(*pDataBytes);
/* Read the entries. */
while(count-- > 0)
//检查是否仍有叶节点未建立,每循环一次建立起一条由根节点至叶节点(符号)的路径
{
int c;
unsigned int curbit;
unsigned char symbol;
unsigned char numbits;
unsigned char numbytes;
unsigned char *bytes;
huffman_node *p = root;
if((c = fgetc(in)) == EOF)
{
free_huffman_tree(root);
return NULL;
}
symbol = (unsigned char)c; //符号
if((c = fgetc(in)) == EOF)
{
free_huffman_tree(root);
return NULL;
}
numbits = (unsigned char)c; //码长
numbytes = (unsigned char)numbytes_from_numbits(numbits);
bytes = (unsigned char*)malloc(numbytes); //为读取码字分配空间
if(fread(bytes, 1, numbytes, in) != numbytes) //读取码字
{
free(bytes);
free_huffman_tree(root);
return NULL;
}
for(curbit = 0; curbit < numbits; ++curbit)
//读取当前码字的每一位
//并依据读取的结果逐步建立起由根节点至该符号叶节点的路径
{
if(get_bit(bytes, curbit)) //当前读取位是否为‘1’
{ //当前读取位为‘1’
if(p->one == NULL)
{
p->one = curbit == (unsigned char)(numbits - 1)
//是否是当前码字的最后一位
//是,则新建叶节点
//不是,则新建非叶节点
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->one->parent = p; //‘1’的一枝的父节点指向当前节点
}
p = p->one; //沿‘1’方向下移一级
}
else
{ //当前读取位为‘0’
if(p->zero == NULL)
{
p->zero = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->zero->parent = p;
}
p = p->zero;
}
}
free(bytes);
}
return root; //返回Huffman树的根节点
}
/*
 * Copy `readlen` bytes from buf[*pindex ...] into `bufout` and advance
 * *pindex by `readlen`.
 *
 * buf/buflen: source buffer and its length
 * pindex:     current read position, updated on success
 *
 * Returns 0 on success, 1 if fewer than `readlen` bytes remain (in
 * which case nothing is copied and *pindex is unchanged).
 */
static int memread(const unsigned char* buf,
                   unsigned int buflen,
                   unsigned int *pindex,
                   void* bufout,
                   unsigned int readlen)
{
    assert(buf && pindex && bufout);
    assert(buflen >= *pindex);
    if(buflen < *pindex)
        return 1;
    /* Compare against the bytes remaining. This accepts a read that
     * ends exactly at the end of the buffer, and cannot overflow the
     * way "readlen + *pindex" can. */
    if(readlen > buflen - *pindex)
        return 1;
    memcpy(bufout, buf + *pindex, readlen);
    *pindex += readlen;
    return 0;
}
static huffman_node*read_code_table_from_memory(const unsigned char* bufin,
unsigned int bufinlen,
unsigned int *pindex,
unsigned int *pDataBytes)
{
huffman_node *root = new_nonleaf_node(0, NULL, NULL);
unsigned int count;
/* Read the number of entries.
(it is stored in network byte order). */
if(memread(bufin, bufinlen, pindex, &count, sizeof(count)))
{
free_huffman_tree(root);
return NULL;
}
count = ntohl(count);
/* Read the number of data bytes this encoding represents. */
if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))
{
free_huffman_tree(root);
return NULL;
}
*pDataBytes = ntohl(*pDataBytes);
/* Read the entries. */
while(count-- > 0)
{
unsigned int curbit;
unsigned char symbol;
unsigned char numbits;
unsigned char numbytes;
unsigned char *bytes;
huffman_node *p = root;
if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))
{
free_huffman_tree(root);
return NULL;
}
if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))
{
free_huffman_tree(root);
return NULL;
}
numbytes = (unsigned char)numbytes_from_numbits(numbits);
bytes = (unsigned char*)malloc(numbytes);
if(memread(bufin, bufinlen, pindex, bytes, numbytes))
{
free(bytes);
free_huffman_tree(root);
return NULL;
}
for(curbit = 0; curbit < numbits; ++curbit)
{
if(get_bit(bytes, curbit))
{
if(p->one == NULL)
{
p->one = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->one->parent = p;
}
p = p->one;
}
else
{
if(p->zero == NULL)
{
p->zero = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->zero->parent = p;
}
p = p->zero;
}
}
free(bytes);
}
return root;
}
//////////把编码后的数据写入输出文件//////////
/*
 * Second pass over the input: replace every byte read from `in` by its
 * code word from `se` and write the packed bit stream to `out`.
 *
 * Bits are packed least significant bit first; a final, partially
 * filled byte is written with its unused high bits set to 0.
 *
 * Returns 0 on success, 1 if a byte has no code in the table or a
 * write fails.
 */
static int do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0; /* output byte being assembled */
    unsigned char curbit = 0;  /* next free bit position in curbyte */
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc]; /* table lookup */
        unsigned long i;

        /* The table was built from an earlier pass over the input; a
         * byte without a code means the two passes saw different data
         * (or a code could not be allocated). */
        if(!code)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            /* Add the current bit to curbyte. */
            curbyte |= (unsigned char)(get_bit(code->bits, i) << curbit);

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}
/*
 * Memory-buffer counterpart of do_file_encode(): replace every byte of
 * bufin[0 .. bufinlen-1] by its code word from `se` and write the
 * packed bit stream through the cache `pc`.
 *
 * Bits are packed least significant bit first; a final, partially
 * filled byte is written with its unused high bits set to 0.
 *
 * Returns 0 on success, 1 if a byte has no code in the table or a
 * write fails.
 */
static int do_memory_encode(buf_cache *pc,
                            const unsigned char* bufin,
                            unsigned int bufinlen,
                            SymbolEncoder *se)
{
    unsigned char curbyte = 0; /* output byte being assembled */
    unsigned char curbit = 0;  /* next free bit position in curbyte */
    unsigned int pos;

    for(pos = 0; pos < bufinlen; ++pos)
    {
        huffman_code *code = (*se)[bufin[pos]];
        unsigned long bit; /* distinct name: no shadowing of the outer index */

        if(!code)
            return 1;

        for(bit = 0; bit < code->numbits; ++bit)
        {
            /* Add the current bit to curbyte. */
            curbyte |= (unsigned char)(get_bit(code->bits, bit) << curbit);

            /* If this byte is filled up then write it
             * out and reset the curbit and curbyte. */
            if(++curbit == 8)
            {
                if(write_cache(pc, &curbyte, sizeof(curbyte)))
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /*
     * If there is data in curbyte that has not been
     * output yet, which means that the last encoded
     * character did not fall on a byte boundary,
     * then output it.
     */
    return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;
}
//////////写存储统计信息的结构体 存储频率和符号数//////////
/*
 * Fill st->freq[] with the relative frequency of every symbol:
 * count / total_count for symbols that have a node, 0 for the rest.
 *
 * Returns 1 when the counts of all nodes add up to total_count,
 * otherwise 0.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)
{
    int sym;
    int summed = 0;

    for(sym = 0; sym < MAX_SYMBOLS; ++sym)
    {
        const huffman_node *node = (*SF)[sym];

        if(!node)
        {
            /* Symbol never occurred. */
            st->freq[sym] = 0;
            continue;
        }

        st->freq[sym] = (float)node->count / total_count;
        summed += node->count;
    }

    return summed == total_count;
}
//////////写存储统计信息的结构体 存储码字和码长//////////
/*
 * Copy the code length and packed code bytes of every symbol from the
 * code table `se` into `st`. Symbols without a code get a length of 0.
 * Always returns 0.
 */
int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)
{
    unsigned long sym;

    for(sym = 0; sym < MAX_SYMBOLS; ++sym)
    {
        const huffman_code *code = (*se)[sym];
        unsigned int nbytes;
        unsigned long k;

        if(!code)
        {
            /* Symbol did not occur in the input, so it was not coded. */
            st->numbits[sym] = 0;
            continue;
        }

        st->numbits[sym] = code->numbits;

        /* Only the bytes that actually hold code bits are copied. */
        nbytes = numbytes_from_numbits(code->numbits);
        for(k = 0; k < nbytes; ++k)
            st->bits[sym][k] = code->bits[k];
    }

    return 0;
}
//////////输出统计信息表文件//////////
/*
 * Write the statistics table to `out_Table`: a header line followed by
 * one line per byte value with the symbol (decimal), its frequency,
 * its code length and, when the length is non-zero, its code word as a
 * string of 0/1 digits in code-bit order.
 *
 * Does nothing when either argument is NULL (no statistics file was
 * requested).
 */
void output_huffman_statistics(huffman_stat *st,FILE *out_Table)
{
    int i;
    unsigned long j; /* same type as numbits[], avoids a signed/unsigned compare */

    if(!st || !out_Table)
        return;

    fprintf(out_Table,"symbol\t freq\t codelength\t code\n");
    for(i = 0; i < MAX_SYMBOLS; ++i)
    {
        fprintf(out_Table,"%d\t ",i);
        fprintf(out_Table,"%f\t ",st->freq[i]);
        /* numbits[] is unsigned long, so it needs %lu rather than %d. */
        fprintf(out_Table,"%lu\t ",st->numbits[i]);

        /* Emit the code word one bit at a time; nothing is printed
         * for a zero-length code. */
        for(j = 0; j < st->numbits[i]; ++j)
            fprintf(out_Table,"%d",get_bit(st->bits[i], j));

        fprintf(out_Table,"\n");
    }
}
//////////进行Huffman编码//////////
/*
 * Huffman-encode the stream `in` into `out`.
 *
 * in:        input stream; it is read twice (rewind() is called between
 *            the passes), so it has to be seekable
 * out:       receives the code table followed by the encoded data
 * out_Table: optional; when non-NULL the per-symbol statistics
 *            (frequency, code length, code word) are written to it
 *
 * Returns 0 on success, non-zero on failure.
 */
int huffman_encode_file(FILE *in, FILE *out, FILE *out_Table)
{
    SymbolFrequencies sf;      /* one node pointer per byte value */
    SymbolEncoder *se;         /* code table */
    huffman_node *root = NULL; /* root of the Huffman tree */
    int rc;
    unsigned int symbol_count; /* total number of input bytes */
    huffman_stat hs;           /* statistics for the report */

    /* First pass: count how often each byte value occurs. After this
     * call the slot index in sf is still the byte value. */
    symbol_count = get_symbol_frequencies(&sf, in);

    /* Must run before the codes are calculated, because building the
     * tree reorders sf. */
    huffST_getSymFrequencies(&sf,&hs,symbol_count);

    /* Build the tree and the code table from the counts. */
    se = calculate_huffman_codes(&sf);
    root = sf[0];
    if(!se)
    {
        free_huffman_tree(root);
        return 1;
    }

    /* The statistics file is optional: without it there is no stream
     * to print to. */
    if(out_Table)
    {
        huffST_getcodeword(se, &hs);
        output_huffman_statistics(&hs,out_Table);
    }

    /* Second pass: go back to the start of the input and encode it
     * with the table that was just built. */
    rewind(in);
    rc = write_code_table(out, se, symbol_count);
    if(rc == 0)
        rc = do_file_encode(in, out, se);

    /* Free the Huffman tree and the code table. */
    free_huffman_tree(root);
    free_encoder(se);
    return rc;
}
//////////读取Huffman码字,并解码输出//////////
/*
 * Decode a stream produced by huffman_encode_file(): read the code
 * table from `in`, rebuild the tree, then walk it bit by bit and write
 * each decoded symbol to `out`.
 *
 * Decoding stops once the number of symbols announced in the header
 * has been written, or when the input ends.
 *
 * Returns 0 on success; 1 if the code table cannot be read, the data
 * follows a branch that does not exist in the tree, or a write fails.
 */
int huffman_decode_file(FILE *in, FILE *out)
{
    huffman_node *root, *p;
    int c;
    unsigned int data_count;

    /* Read the Huffman code table. */
    root = read_code_table(in, &data_count);
    if(!root)
        return 1;

    /* Decode the file. */
    p = root;
    while(data_count > 0 && (c = fgetc(in)) != EOF)
    {
        unsigned char byte = (unsigned char)c;
        /* Selects one bit per step, lowest bit first; it becomes 0
         * after the eighth shift, which ends the inner loop. */
        unsigned char mask = 1;

        while(data_count > 0 && mask)
        {
            p = byte & mask ? p->one : p->zero;
            mask <<= 1;

            /* A missing branch means the data does not match the code
             * table (corrupt input); stop instead of dereferencing
             * NULL. */
            if(!p)
            {
                free_huffman_tree(root);
                return 1;
            }

            if(p->isLeaf)
            {
                if(fputc(p->symbol, out) == EOF)
                {
                    free_huffman_tree(root);
                    return 1;
                }
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);
    return 0;
}
/* Size in bytes of the staging buffer used while encoding to memory. */
#define CACHE_SIZE 1024
/*
 * Huffman-encode bufin[0 .. bufinlen-1] into a newly allocated buffer.
 *
 * On success *pbufout points at the encoded data (code table followed
 * by the bit stream) and *pbufoutlen holds its length; the caller
 * frees *pbufout. On failure *pbufout may hold a partial buffer that
 * the caller should still free.
 *
 * Returns 0 on success, non-zero on invalid arguments, allocation
 * failure or a failed write to the output buffer.
 */
int huffman_encode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **pbufout,
                          unsigned int *pbufoutlen)
{
    SymbolFrequencies sf;
    SymbolEncoder *se;
    huffman_node *root = NULL;
    int rc;
    unsigned int symbol_count;
    buf_cache cache;

    /* Ensure the arguments are valid. */
    if(!pbufout || !pbufoutlen)
        return 1;

    if(init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen))
        return 1;

    /* Get the frequency of each symbol in the input memory. */
    symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);

    /* Build an optimal table from the symbolCount. */
    se = calculate_huffman_codes(&sf);
    root = sf[0];
    if(!se)
    {
        free_huffman_tree(root);
        free_cache(&cache);
        return 1;
    }

    /* Scan the memory again and, using the table
       previously built, encode it into the output memory. */
    rc = write_code_table_to_memory(&cache, se, symbol_count);
    if(rc == 0)
        rc = do_memory_encode(&cache, bufin, bufinlen, se);

    /* Flush the cache. A failed flush means the tail of the output is
     * missing, so it has to be reported. */
    if(flush_cache(&cache) && rc == 0)
        rc = 1;

    /* Free the Huffman tree. */
    free_huffman_tree(root);
    free_encoder(se);
    free_cache(&cache);
    return rc;
}
/*
 * Decode a buffer produced by huffman_encode_memory().
 *
 * On success *pbufout points at a newly allocated buffer with the
 * decoded bytes and *pbufoutlen holds the number of bytes decoded; the
 * caller frees *pbufout. On failure the outputs are left untouched.
 *
 * Returns 0 on success; 1 on invalid arguments, an unreadable code
 * table, allocation failure, or data that follows a branch that does
 * not exist in the tree.
 */
int huffman_decode_memory(const unsigned char *bufin,
                          unsigned int bufinlen,
                          unsigned char **pbufout,
                          unsigned int *pbufoutlen)
{
    huffman_node *root, *p;
    unsigned int data_count;
    unsigned int i = 0;
    unsigned char *buf;
    unsigned int bufcur = 0;

    /* Ensure the arguments are valid. */
    if(!pbufout || !pbufoutlen)
        return 1;

    /* Read the Huffman code table. */
    root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);
    if(!root)
        return 1;

    /* Room for every symbol announced in the header. At least one byte
     * is requested so that an empty payload does not depend on what
     * malloc(0) returns. */
    buf = (unsigned char*)malloc(data_count ? data_count : 1);
    if(!buf)
    {
        free_huffman_tree(root);
        return 1;
    }

    /* Decode the memory. */
    p = root;
    for(; i < bufinlen && data_count > 0; ++i)
    {
        unsigned char byte = bufin[i];
        /* Selects one bit per step, lowest bit first; it becomes 0
         * after the eighth shift, which ends the inner loop. */
        unsigned char mask = 1;

        while(data_count > 0 && mask)
        {
            p = byte & mask ? p->one : p->zero;
            mask <<= 1;

            /* A missing branch means the data does not match the code
             * table (corrupt input); stop instead of dereferencing
             * NULL. */
            if(!p)
            {
                free(buf);
                free_huffman_tree(root);
                return 1;
            }

            if(p->isLeaf)
            {
                /* data_count is decremented for every byte stored, so
                 * bufcur never exceeds the allocated size. */
                buf[bufcur++] = p->symbol;
                p = root;
                --data_count;
            }
        }
    }

    free_huffman_tree(root);
    *pbufout = buf;
    *pbufoutlen = bufcur;
    return 0;
}
实验结果分析
程序运行完成之后生成一个huff文件(作为编码后的输出文件,包含码表和编码后的数据)和统计数据文件,对一个ppt文件编码后生成的统计数据txt文件如图:
各格式样本文件的概率分布如下图所示:
统计结果如下表:
文件类型 | 平均码长 | 信息熵 | 原文件大小(kb) | 压缩后文件大小(kb) | 压缩比 |
---|---|---|---|---|---|
ppt | 6.337851 | 6.309532 | 182 | 146 | 1.247 |
7.630489 | 7.587450 | 300 | 287 | 1.045 | |
png | 7.999920 | 7.997800 | 328 | 329 | 0.996 |
bmp | 7.835411 | 7.799278 | 554 | 544 | 1.018 |
docx | 7.999294 | 7.995741 | 264 | 265 | 0.996 |
jpg | 7.868257 | 7.842246 | 1062 | 1045 | 1.016 |
html | 5.015010 | 6.260903 | 31 | 25 | 1.240 |
txt | 5.896542 | 5.866860 | 1 | 1 | 1.000 |
rar | 7.999984 | 7.998295 | 4386 | 4386 | 1.000 |
gif | 7.939629 | 7.909703 | 59 | 58 | 1.017 |