文件压缩
程序员文章站
2022-07-12 17:14:39
...
项目名称:文件压缩
开发环境:window8.1 ——VS2013
使用对象:文本文件(.txt)
开发语言:C/C++
使用工具:VS2013 、 Beyond compare、Linux下md5sum
项目技术:Huffman树,I/O流,模板,String容器,文件操作等
项目描述:
<1>.文件压缩过程:先统计文件中每个ASSCII码出现的次数,采用贪心算法,用文件中ASCII码出现的次数构建Huffman树,生成每个ASCII所对应的Huffman编码,然后将原文件中的ASCII码,替换成Huffman编码,并将文件中出现过的所有ASCII及其次数存入压缩文件起始位置,从而降低文件的大小。
<2>.文件解压过程:读取压缩文件头部所存的ASCII及其次数,用其生成Huffman树,将Huffman编码还原成ASCII码。
项目概略图:
项目测试数据:
项目性能分析:
VS2013环境下测试,该项目平均压缩率在78%左右,Release版本下,每秒可压缩文件5M。
项目中所遇到的问题:
1 . 程序崩于文件统计次数,无法完成统计
2 . 文件压缩过程进去死循环,无法正常完成压缩
3 . 文件越压越大
4 . 文件无法正常完成解压缩
5 . 对于只有一种字符的文件无法压缩
项目模块分析图:
模拟生成Huffaman树
模拟生成Huffman编码
Huffman编码存储
项目总结
经多次多项测试,采用加权算法求得平均值,本项目可以针对文本文件进行压缩,压缩率可达78%,每秒可压缩文件5M。
项目源码:
/****************** "HuffmanTree.h" *********************/
#pragma once
#include <iostream>
#include <vector>
#include <queue>
using namespace std;
template <class W>
struct HuffmanTreeNode {
W _w;//权值 次数
HuffmanTreeNode<W>* _left;//左孩子
HuffmanTreeNode<W>* _right;//右孩子
HuffmanTreeNode<W>* _parent;//父亲
HuffmanTreeNode(const W& w)
:_left(NULL)
, _right(NULL)
, _w(w)
, _parent(NULL)
{}
};
//构造哈夫曼树
template <class W>
class HuffmanTree {
typedef HuffmanTreeNode<W> Node;
struct PtrNodeCompare{
bool operator()(Node* left, Node* right){
return left->_w > right->_w;
}
};
public:
HuffmanTree()
:_root(NULL)
{}
HuffmanTree(W* w, size_t n, const W& invalid){
priority_queue <Node*, vector<Node*>, PtrNodeCompare> minheap;
for (size_t i = 0; i < n; ++i){
if (w[i] != invalid){
minheap.push(new Node(w[i]));
}
}
if (minheap.size() == 0){
cout << "无效文件" << endl;
exit(0);
}
else if (minheap.size() == 1){
Node* right = minheap.top();
_root = new Node(right->_w);
right->_parent = _root;
_root->_right = right;
}
else{
while (minheap.size() > 1){
Node* left = minheap.top();
minheap.pop();
Node* right = minheap.top();
minheap.pop();
Node* parent = new Node(left->_w + right->_w);
parent->_left = left;
parent->_right = right;
left->_parent = parent;
right->_parent = parent;
minheap.push(parent);
}
_root = minheap.top();
}
}
Node* GetRoot(){
return _root;
}
~HuffmanTree(){
Destory(_root);
_root = NULL;
}
void Destory(Node* root){
if (root == NULL){
return;
}
Destory(root->_left);
Destory(root->_right);
delete root;
}
private:
HuffmanTree(const HuffmanTree<W>& t);
HuffmanTree<W>& operator= (const HuffmanTree<W>& t);
protected:
Node* _root;
};
/******************** "FileCompress.h" **********************/
#pragma once
#include <string>
#include <stdio.h>
#include <windows.h>
#include <fstream>
#include <algorithm>
#include <assert.h>
#include "HuffmanTree.h"
#pragma warning (disable: 4996)
using namespace std;
typedef long long LongType;
struct CharInfo{
char _ASCII; //字符
LongType _count;//次数
string _code;//权值
CharInfo operator+(const CharInfo& info){
CharInfo ret;
ret._count = _count + info._count;
return ret;
}
bool operator>(const CharInfo& info){
return _count > info._count;
}
bool operator<(const CharInfo& info){
return _count < info._count;
}
bool operator!=(const CharInfo& info){
return _count != info._count;
}
bool operator==(const CharInfo& info){
return _count == info._count;
}
};
class FileCompress{
typedef HuffmanTreeNode<CharInfo> Node;
struct TmpInfo{
char _ch; //字符
LongType _num;//数量
};
public:
FileCompress(){
for (size_t i = 0; i < 256; ++i){
_infos[i]._ASCII = i;
_infos[i]._count = 0;
}
}
//1.统计字符出现的次数
void Compress(const char* file){
FILE* f_out = fopen(file, "r");
assert(f_out);
char character = fgetc(f_out);
while (character != EOF){
_infos[(unsigned char)character]._count++;
character = fgetc(f_out);
}
//2.构建huffman tree
CharInfo invalid;
invalid._count = 0;
HuffmanTree<CharInfo> tree(_infos, 256, invalid);
//3.生成code
GenerateHuffmanCode(tree.GetRoot());
//4.压缩
string compressFile = file;
compressFile += ".huffman";
FILE* f_in = fopen(compressFile.c_str(), "wb");
for (size_t i = 0; i < 256; ++i){
if (_infos[i]._count > 0){
TmpInfo tmpinfo;
tmpinfo._ch = _infos[i]._ASCII; //字符
tmpinfo._num = _infos[i]._count;//次数
//解压时需要用其生成Huffman树
fwrite(&tmpinfo, sizeof(TmpInfo), 1, f_in);
}
}
TmpInfo tmpinfo;
tmpinfo._num = -1;
fwrite(&tmpinfo, sizeof(TmpInfo), 1, f_in);
rewind(f_out);
character = fgetc(f_out);
size_t CeffectiveBit = 0;
char value = 0;
int pos = 0;
while (character != EOF){
string& code = _infos[(unsigned char)character]._code;
for (size_t i = 0; i < code.size(); ++i){
if (code[i] == '0'){
value &= ~(1 << pos);
}
else if (code[i] == '1'){
value |= (1 << pos);
}
else{
assert(false);
}
++pos;
if (pos == 8){
fputc(value, f_in);
value = 0;
pos = 0;
}
++CeffectiveBit;
}
character = fgetc(f_out);
}
cout << "压缩Huffman编码长度:" << CeffectiveBit<< endl;
if (pos > 0){
fputc(value, f_in);
}
fclose(f_out);
fclose(f_in);
}
void GenerateHuffmanCode(Node* root){
if (root == NULL){
return;
}
if (root->_left == NULL && root->_right == NULL) {
string code;
Node* cur = root;
Node* parent = cur->_parent;
while (parent) {
if (cur == parent->_left){
code.push_back('0');
}
else{
code.push_back('1');
}
cur = parent;
parent = cur->_parent;
}
reverse(code.begin(), code.end());
_infos[(unsigned char)root->_w._ASCII]._code = code;
return;
}
GenerateHuffmanCode(root->_left);
GenerateHuffmanCode(root->_right);
}
/************************** 解压缩 *******************************/
void Uncompress(const char* file){
assert(file);
string uncompressFile = file;
size_t pos = uncompressFile.rfind('.');
assert(pos != string::npos);
uncompressFile.erase(pos, uncompressFile.size() - pos);
uncompressFile += ".unhuffman";//测试所用 分辨与源文件的区别
FILE* fout = fopen(file, "rb");
assert(fout);
FILE* fin = fopen(uncompressFile.c_str(), "w");
assert(fin);
TmpInfo info;
fread(&info, sizeof(TmpInfo), 1, fout);
while (info._num != -1){
_infos[(unsigned char)info._ch]._ASCII = info._ch;
_infos[(unsigned char)info._ch]._count = info._num;
fread(&info, sizeof(TmpInfo), 1, fout);
}
//1.重建huffman tree
size_t n = 0;
CharInfo invalid;
invalid._count = 0;
HuffmanTree<CharInfo> tree(_infos, 256, invalid);
Node* root = tree.GetRoot();
Node* cur = root;
LongType fileSize = root->_w._count;
cout <<"文件中ASCII个数:" <<fileSize << endl;
//2.解压缩
char value = fgetc(fout);
while (1){
for (size_t pos = 0; pos < 8; ++pos){
if (value & (1 << pos)){
cur = cur->_right;
}
else{
cur = cur->_left;
}
++n;
if (NULL == cur->_left && NULL == cur->_right){
fputc(cur->_w._ASCII, fin);
cur = root;
if (--fileSize == 0){
break;
}
}
}
if (fileSize == 0){
break;
}
value = fgetc(fout);
}
cout <<"解压Huffman编码长度:"<< n << endl;
fclose(fout);
fclose(fin);
}
protected:
CharInfo _infos[256];
};//类结束
/******************* Test.cpp *********************/
#include "FileCompress.h"
void TestFilecompress()
{
FileCompress fc;
size_t start = GetTickCount();
fc.Compress("Input.txt");
size_t end = GetTickCount();
cout << "UseTime:" << end - start << endl;
fc.Uncompress("Input.txt.huffman");
}
int main(){
TestFilecompress();
system("pause");
return 0;
}
上一篇: Trie树 (状压DP)
下一篇: 关于字体压缩的方案