基于Huffman编码的压缩和解压缩小软件(附C++源码)
程序员文章站
2022-07-02 20:46:26
...
压缩前:
将pic.png拖到.exe文件上,可得到.zLzip压缩文件:
编码过程:
压缩过程:
将.zLzip压缩文件拖回可解压缩得到原文件:
顺便一提,当原文件内字符分布均衡时,其信息熵很低,压缩效果不太好。
代码如下(编译器是TDM-GCC 4.9.2 64-bit Release, C++11标准)
main.cpp文件:
#include "HuffmanEncoderCompress.h"
void test();
void test2(int argc, char** argv);
int main(int argc, char** argv) {
// test(); // 本地测试
test2(argc, argv); // 界面拖动
return 0;
}
// 本地测试
void test() {
HuffmanEncoderCompress* hecp = new HuffmanEncoderCompress("test\\pic.png");
hecp->run();
// hecp->printInfo();
int n = 0;
for (int i = 0; i < n; ++i) {
HuffmanEncoderCompress* hecp2 = new HuffmanEncoderCompress(hecp->getOutputFileName(), true);
hecp2->run();
delete hecp;
hecp = hecp2;
}
delete hecp;
}
// 读取输入(可在界面中拖动)
void test2(int argc, char** argv) {
if (argc < 2) exit(-1);
HuffmanEncoderCompress* hecp = new HuffmanEncoderCompress(argv[1]);
hecp->run();
delete hecp;
getchar(); // 防止运行界面闪退
}
HuffmanEncoderCompress.h头文件:
#ifndef __HUFFMAN_ENCODER_COMPRESS_H__
#define __HUFFMAN_ENCODER_COMPRESS_H__
#include <string>
using std::string;
constexpr int _CODE_NUM = 256; // 码数
constexpr int _ZIP_NAME_LEN = 16; // 压缩识别符长度
constexpr int _FILE_NAME_LEN = 256; // 文件名长度
constexpr int _FILE_SIZE_LEN = 8; // 文件大小值长度
constexpr int _CODE_FREQUENCY_LEN = 8; // 码频值长度
constexpr int _ZLZIP_HEAD_LEN = _ZIP_NAME_LEN + _FILE_NAME_LEN + _FILE_SIZE_LEN // 头文件信息长度
+ _CODE_FREQUENCY_LEN * _CODE_NUM;
static const char _ZIP_NAME[_ZIP_NAME_LEN] = "zLimbo zLzip"; // 压缩 识别符
// 编码类
struct Code {
unsigned char oldCode; // 旧码
unsigned long long frequency; // 频率
unsigned long long newCode; // 新码
string newCodeStr; // 新码字符串表示 debug
int length; // 新码长度
Code(): oldCode(0), frequency(0), newCode(0), length(0) { }
};
// Huffman 节点
struct HuffmanTreeNode {
unsigned long long weight; // 权重
Code* codePtr; // 指向码(叶子节点才有指向)
HuffmanTreeNode* left; // 左分支
HuffmanTreeNode* right; // 右分支
HuffmanTreeNode(unsigned long long w = 0, Code* cp = nullptr,
HuffmanTreeNode* l = nullptr, HuffmanTreeNode* r = nullptr):
weight(w), codePtr(cp), left(l), right(r)
{ }
};
// 自定义比较器,用于优先队列
class CmparatorOfHuffmanTreeNode {
public:
bool operator() (HuffmanTreeNode*& lhs, HuffmanTreeNode*& rhs) const {
return lhs->weight > rhs->weight;
}
};
class HuffmanEncoderCompress {
private:
Code _codes[_CODE_NUM]; // 码
bool _isCompress; // 是否压缩(用于多次压缩 )
string _inputFileName; // 输入文件名
string _outputFileName; // 输出文件名
unsigned long long _inputFileSize; // 输入文件大小
unsigned long long _outputFileSize; // 输出文件大小
HuffmanTreeNode* _huffmanTreeRoot; // Huffman树根节点
private:
// 释放节点
freeNode(HuffmanTreeNode* np);
public:
// 构造
HuffmanEncoderCompress(const string& inputFileName, bool isCompress = false);
// 析构
~HuffmanEncoderCompress();
// 无法拷贝构造
HuffmanEncoderCompress(const HuffmanEncoderCompress& hec) = delete;
// 运行
void run();
// 统计频率
void statisticalFrequency();
// 构建 Huffman 树
HuffmanTreeNode* buildHuffmanTree();
// 获得 Huffman 编码
void getNewCodes(HuffmanTreeNode* np, unsigned long long newCode, string newCodeStr, int length);
// 压缩文件
void compress();
// 搜索节点,辅助于解码
bool findNode(HuffmanTreeNode*& np, unsigned char inputByte, int& pos);
// 解压缩
void uncompress();
// 打印哈夫曼编码信息
void printHuffmanEncodeInfo();
// 打印压缩或解压缩信息
void printInfo(const char* type);
// 获取输出文件名
string getOutputFileName() const { return _outputFileName; }
// 判断两个文件是否相等
bool compare2File(const string& fileName1, const string& fileName2);
};
#endif
HuffmanEncoderCompress.cpp 文件
#include "HuffmanEncoderCompress.h"
#include <cstdio>
#include <cstring>
#include <queue>
using namespace std;
HuffmanEncoderCompress::HuffmanEncoderCompress(const string& inputFileName, bool isCompress):
_isCompress(isCompress),
_inputFileName(inputFileName), _outputFileName(inputFileName+".zLzip"),
_inputFileSize(0), _outputFileSize(0), _huffmanTreeRoot(nullptr)
{
for (int i = 0; i < _CODE_NUM; ++i) _codes[i].oldCode = i;
}
// 释放节点及其子树
HuffmanEncoderCompress::freeNode(HuffmanTreeNode* np) {
if (np) {
freeNode(np->left);
freeNode(np->right);
delete np;
}
}
// 析构
HuffmanEncoderCompress::~HuffmanEncoderCompress() {
freeNode(_huffmanTreeRoot);
}
// 运行
void HuffmanEncoderCompress::run() {
FILE *inputFp = NULL;
if ((inputFp = fopen(_inputFileName.c_str(), "rb")) == NULL) {
printf("open file %s failed!\n", _inputFileName.c_str());
exit(-1);
}
char zipName[_ZIP_NAME_LEN];
fread(zipName, _ZIP_NAME_LEN, 1, inputFp);
if (_isCompress || strcmp(zipName, _ZIP_NAME)) { // 无识别符,非压缩文件
fclose(inputFp);
printf("开始压缩文件%s......\n", _inputFileName.c_str());
printf("正在统计频率......\n");
statisticalFrequency(); // 统计频率
printf("正在构建哈夫曼树......\n");
_huffmanTreeRoot = buildHuffmanTree(); // 构建哈夫曼树
printf("正在产生新编码......\n");
getNewCodes(_huffmanTreeRoot, 0, string(), 0); // 获得新编码
printHuffmanEncodeInfo();
printf("正在压缩......\n");
compress(); // 压缩
printInfo("压缩");
printf("压缩成功\n");
} else { // 有识别符,是压缩文件
printf("开始解压缩文件%s......\n", _inputFileName.c_str());
printf("读取原始文件信息......\n");
char outputFileName[_FILE_NAME_LEN];
fread(outputFileName, _FILE_NAME_LEN, 1, inputFp); // 读入原文件名
printf("原始文件名为%s\n", outputFileName);
_outputFileName = string(outputFileName);
fread(&_outputFileSize, _FILE_SIZE_LEN, 1, inputFp); // 读入原文件大小
for (int i = 0; i < _CODE_NUM; ++i) // 读入字符频率表
fread(&_codes[i].frequency, _CODE_FREQUENCY_LEN, 1, inputFp);
fclose(inputFp);
printf("正在构建哈夫曼树......\n");
_huffmanTreeRoot = buildHuffmanTree(); // 构建哈夫曼树
printf("正在产生新编码......\n");
getNewCodes(_huffmanTreeRoot, 0, string(), 0); // 获得新编码
printHuffmanEncodeInfo();
printf("正在解压缩......\n");
uncompress(); // 解压缩
printInfo("解压缩");
printf("解压成功\n");
}
}
// 统计频率
void HuffmanEncoderCompress::statisticalFrequency() {
// 打开文件
FILE *inputFp = NULL;
if ((inputFp = fopen(_inputFileName.c_str(), "rb")) == NULL) {
printf("open file %s failed!\n", _inputFileName.c_str());
exit(-1);
}
// 统计频率
while (!feof(inputFp)) {
unsigned char inputByte;
fread(&inputByte, 1, 1, inputFp);
if (feof(inputFp)) break;
++_codes[inputByte].frequency;
++_inputFileSize;
}
fclose(inputFp);
}
// 构建 Huffman 树
HuffmanTreeNode* HuffmanEncoderCompress::buildHuffmanTree() {
// 使用优先队列,自定义比较器
priority_queue<HuffmanTreeNode*, vector<HuffmanTreeNode*>, CmparatorOfHuffmanTreeNode> nps;
// 初始化叶子节点
for (int i = 0; i < _CODE_NUM; ++i)
nps.push(new HuffmanTreeNode(_codes[i].frequency, &_codes[i]));
// 取最小两个节点合并
while (true) {
HuffmanTreeNode* np1 = nps.top(); nps.pop();
HuffmanTreeNode* np2 = nps.top(); nps.pop();
HuffmanTreeNode* np3 = new HuffmanTreeNode(np1->weight + np2->weight, nullptr, np1, np2);
if (!nps.empty()) nps.push(np3);
else return np3; // 优先队列为空则构成 Huffman 树
}
}
// 获得 Huffman 编码
void HuffmanEncoderCompress::getNewCodes(HuffmanTreeNode* np, unsigned long long newCode, string newCodeStr, int length) {
// 是叶子节点则结束,记录编码
if (np->codePtr) {
np->codePtr->newCode = newCode;
np->codePtr->newCodeStr = newCodeStr;
np->codePtr->length = length;
return;
}
// 否则向左右分支探索
newCode <<= 1;
++length;
if (np->left) getNewCodes(np->left, newCode, newCodeStr+"0", length);
if (np->right) getNewCodes(np->right, newCode+1, newCodeStr+"1", length);
}
// 压缩文件
void HuffmanEncoderCompress::compress() {
FILE *inputFp = NULL;
if ((inputFp = fopen(_inputFileName.c_str(), "rb")) == NULL) {
printf("open file %s failed!\n", _inputFileName.c_str());
exit(-1);
}
FILE *outputFp = NULL;
if ((outputFp = fopen(_outputFileName.c_str(), "wb")) == NULL) {
printf("open file %s failed!\n", _outputFileName.c_str());
exit(-1);
}
// 写入压缩文件头信息
fwrite(_ZIP_NAME, _ZIP_NAME_LEN, 1, outputFp); // 识别符
fwrite(_inputFileName.c_str(), _FILE_NAME_LEN, 1, outputFp); // 文件名
fwrite(&_inputFileSize, _FILE_SIZE_LEN, 1, outputFp); // 文件大小
for (int i = 0; i < _CODE_NUM; ++i) // 字符频率,用以构建Huffman树
fwrite(&_codes[i].frequency, _CODE_FREQUENCY_LEN, 1, outputFp);
// 压缩所需临时辅助变量
unsigned char inputByte;
unsigned char outputByte = 0;
unsigned long long newCode;
int length;
int cnt = 0;
unsigned long long currentInputSize = 0;
unsigned long long currentOutputSize = 0;
double currRate = 0.0;
// string s; //debug
while (!feof(inputFp)) {
fread(&inputByte, 1, 1, inputFp);
if (feof(inputFp)) break;
// 记录速率
double rate = (double)(++currentInputSize) / _inputFileSize * 100;
if (rate - currRate >= 10) {
currRate = rate;
// system("cls");
printf("已压缩:%.1f%%\t压缩率:%.2f%%\n", currRate, (double)_outputFileSize/currentInputSize*100);
}
newCode = _codes[(int)inputByte].newCode;
length = _codes[(int)inputByte].length;
// printf("# %d(%s) ", newCode, _codes[(int)inputByte].newCodeStr.c_str()); //debug
while (length--) {
outputByte <<= 1;
outputByte += (newCode>>length) & 1;
// s += (newCode >> length) & 1 ? "1" : "0"; //debug
if (++cnt == 8) {
// printf("\t %d(%s)\n", (int)outputByte, s.c_str()); //debug
// s.clear(); //debug
// system("pause"); //debug
// if (length > 0) printf("# %s ", _codes[(int)inputByte].newCodeStr.c_str()); //debug
fwrite(&outputByte, 1, 1, outputFp);
outputByte = 0;
cnt = 0;
++_outputFileSize;
}
}
}
if (cnt < 8) { // 最后一个不足8比特填充 0
outputByte <<= 8-cnt;
fwrite(&outputByte, 1, 1, outputFp);
++_outputFileSize;
}
//system("cls");
// 打印压缩信息
printf("已压缩:%.1f%%\t压缩率:%.2f%%\n", 100.0, (double)_outputFileSize/currentInputSize*100);
fclose(inputFp);
fclose(outputFp);
}
// 搜索节点,辅助于解码
bool HuffmanEncoderCompress::findNode(HuffmanTreeNode*& np, unsigned char inputByte, int& pos) {
// 叶子节点则搜索成功
if (np->codePtr) return true;
if (pos < 0) return false;
int val = (inputByte >> pos) & 1;
--pos;
if (val == 0) {
np = np->left;
return findNode(np, inputByte, pos);
} else {
np = np->right;
return findNode(np, inputByte, pos);
}
}
// 解压缩
void HuffmanEncoderCompress::uncompress() {
FILE *inputFp = NULL;
if ((inputFp = fopen(_inputFileName.c_str(), "rb")) == NULL) {
printf("open file %s failed!\n", _inputFileName.c_str());
exit(-1);
}
FILE *outputFp = NULL;
if ((outputFp = fopen(_outputFileName.c_str(), "wb")) == NULL) {
printf("open file %s failed!\n", _outputFileName.c_str());
exit(-1);
}
fseek(inputFp, _ZLZIP_HEAD_LEN, SEEK_SET);
_inputFileSize = _ZLZIP_HEAD_LEN;
// 解压缩所需临时变量
unsigned char inputByte;
unsigned char outputByte;
HuffmanTreeNode* np = _huffmanTreeRoot;
unsigned long long currentOutputSize = 0;
int pos;
double currRate = 0.0;
while (!feof(inputFp)) {
fread(&inputByte, 1, 1, inputFp);
if (feof(inputFp)) break;
++_inputFileSize;
pos = 7;
// printf("%d(", (int)inputByte); //debug
// for (int i = 7; i >= 0; --i) { //debug
// printf("%d", (inputByte >> i) & 1); //debug
// } //debug
// printf(") "); //debug
while (findNode(np, inputByte, pos)) {
// if (!np) printf("np = nullptr\n"); //debug
// if (!np->codePtr) printf("np->codePtr = nullptr\n"); //debug
outputByte = np->codePtr->oldCode;
// printf("\tfind: %d(%s) ", (int)outputByte, np->codePtr->newCodeStr.c_str()); //debug
fwrite(&outputByte, 1, 1, outputFp);
// 记录速率
double rate = (double)(++currentOutputSize) / _outputFileSize * 100;
if (rate - currRate >= 10) {
currRate = rate;
// system("cls");
printf("已解压缩:%.1f%%\t解压缩率:%.2f%%\n", currRate, (double)currentOutputSize/_inputFileSize*100);
}
if (currentOutputSize == _outputFileSize) {
printf("已解压缩:%.1f%%\t解压缩率:%.2f%%\n", 100.0, (double)currentOutputSize/_inputFileSize*100);
break;
}
np = _huffmanTreeRoot;
}
// if (!findNode(np, inputByte, pos)) { //debug
// printf("\n"); //debug
// system("pause"); //debug
// }
}
fclose(inputFp);
fclose(outputFp);
}
// 打印哈夫曼编码信息
void HuffmanEncoderCompress::printHuffmanEncodeInfo() {
//printf("file size: %d B, %f KB, %f MB\n", _inputFileSize, _inputFileSize/1024.0, _inputFileSize/1024.0/1024.0);
printf("%-10s %-10s %-20s %-5s %-10s\n", "原码", "频率", "哈夫曼编码", "长度", "十进制");
for (int i = 0; i < _CODE_NUM; ++i) {
Code &code = _codes[i];
printf("%-10d %-10llu %-20s %-5d %-10llu\n", (int)code.oldCode, code.frequency,
code.newCodeStr.c_str(), code.length, code.newCode);
}
}
// 打印压缩或解压缩信息
void HuffmanEncoderCompress::printInfo(const char* type) {
double compressRate = (double)_outputFileSize / _inputFileSize * 100;
printf("%s率:%.2f%%\n", type, compressRate);
double inputFileSize = _inputFileSize, outputFileSize = _outputFileSize;
if (inputFileSize < 1024) {
printf("输入文件大小:%.2fB, 输出文件大小:%.2fB\n", inputFileSize, outputFileSize);
return;
}
inputFileSize /= 1024; outputFileSize /= 1024;
if (inputFileSize < 1024) {
printf("输入文件大小:%.2fKB, 输出文件大小:%.2fKB\n", inputFileSize, outputFileSize);
return;
}
inputFileSize /= 1024; outputFileSize /= 1024;
if (inputFileSize < 1024) {
printf("输入文件大小:%.2fMB, 输出文件大小:%.2fMB\n", inputFileSize, outputFileSize);
return;
}
inputFileSize /= 1024; outputFileSize /= 1024;
if (inputFileSize < 1024) {
printf("输入文件大小:%.2fGB, 输出文件大小:%.2fGB\n", inputFileSize, outputFileSize);
return;
}
}
// 判断两个文件是否相等
bool HuffmanEncoderCompress::compare2File(const string& fileName1, const string& fileName2) {
FILE *fp1 = NULL;
if ((fp1 = fopen(fileName1.c_str(), "rb")) == NULL) {
printf("open file %s failed!\n", fileName1.c_str());
exit(-1);
}
FILE *fp2 = NULL;
if ((fp2 = fopen(fileName2.c_str(), "rb")) == NULL) {
printf("open file %s failed!\n", fileName2.c_str());
exit(-1);
}
while (!feof(fp1) && !feof(fp2)) {
unsigned char uch1, uch2;
fread(&uch1, 1, 1, fp1);
fread(&uch2, 1, 1, fp2);
if (uch1 != uch2) return false;
}
if (!feof(fp1) || !feof(fp2)) return false;
return true;
}