欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

文件压缩

程序员文章站 2022-07-12 17:14:39
...

项目名称:文件压缩

开发环境:window8.1 ——VS2013

使用对象:文本文件(.txt)

开发语言:C/C++

使用工具:VS2013 、 Beyond compare、Linux下md5sum

项目技术:Huffman树,I/O流,模板,String容器,文件操作等

项目描述
<1>.文件压缩过程:先统计文件中每个ASSCII码出现的次数,采用贪心算法,用文件中ASCII码出现的次数构建Huffman树,生成每个ASCII所对应的Huffman编码,然后将原文件中的ASCII码,替换成Huffman编码,并将文件中出现过的所有ASCII及其次数存入压缩文件起始位置,从而降低文件的大小。
<2>.文件解压过程:读取压缩文件头部所存的ASCII及其次数,用其生成Huffman树,将Huffman编码还原成ASCII码。


项目概略图:


文件压缩


项目测试数据:


文件压缩


项目性能分析:


VS2013环境下测试,该项目平均压缩率在78%左右,Release版本下,每秒可压缩文件5M。


项目中所遇到的问题:


1 . 程序崩于文件统计次数,无法完成统计

2 . 文件压缩过程进去死循环,无法正常完成压缩

3 . 文件越压越大

4 . 文件无法正常完成解压缩

5 . 对于只有一种字符的文件无法压缩


项目模块分析图:


模拟生成Huffaman树
文件压缩


模拟生成Huffman编码
文件压缩


Huffman编码存储


文件压缩


项目总结


经多次多项测试,采用加权算法求得平均值,本项目可以针对文本文件进行压缩,压缩率可达78%,每秒可压缩文件5M。


项目源码:

构思源码链接


/****************** "HuffmanTree.h" *********************/
#pragma once

#include <iostream>
#include <vector>
#include <queue>
using namespace std;

template <class W>
struct HuffmanTreeNode {

    W _w;//权值 次数
    HuffmanTreeNode<W>* _left;//左孩子
    HuffmanTreeNode<W>* _right;//右孩子
    HuffmanTreeNode<W>* _parent;//父亲

    HuffmanTreeNode(const W& w)
        :_left(NULL)
        , _right(NULL)
        , _w(w)
        , _parent(NULL)
    {}

};

//构造哈夫曼树
template <class W>
class HuffmanTree {

    typedef HuffmanTreeNode<W> Node;

    struct PtrNodeCompare{

        bool operator()(Node* left, Node* right){
            return left->_w > right->_w;
        }
    };

public:
    HuffmanTree()
        :_root(NULL)
    {}

    HuffmanTree(W* w, size_t n, const W& invalid){

        priority_queue <Node*, vector<Node*>, PtrNodeCompare> minheap;
        for (size_t i = 0; i < n; ++i){
            if (w[i] != invalid){
                minheap.push(new Node(w[i]));
            }
        }

        if (minheap.size() == 0){

            cout << "无效文件" << endl;
            exit(0);
        }
        else if (minheap.size() == 1){
            Node* right = minheap.top();
            _root = new Node(right->_w);
            right->_parent = _root;
            _root->_right = right;
        }
        else{
            while (minheap.size() > 1){

                Node* left = minheap.top();
                minheap.pop();

                Node* right = minheap.top();
                minheap.pop();

                Node* parent = new Node(left->_w + right->_w);

                parent->_left = left;
                parent->_right = right;

                left->_parent = parent;
                right->_parent = parent;

                minheap.push(parent);
            }

            _root = minheap.top();
        }
    }

    Node* GetRoot(){
        return _root;
    }

    ~HuffmanTree(){
        Destory(_root);
        _root = NULL;
    }

    void Destory(Node* root){
        if (root == NULL){
            return;
        }
        Destory(root->_left);
        Destory(root->_right);
        delete root;
    }

private:
    HuffmanTree(const HuffmanTree<W>& t); 
    HuffmanTree<W>& operator= (const HuffmanTree<W>& t); 

protected:
    Node* _root;
};

/******************** "FileCompress.h" **********************/

#pragma once

#include <string>
#include <stdio.h>
#include <windows.h>
#include <fstream>
#include <algorithm>
#include <assert.h>
#include "HuffmanTree.h"
#pragma warning (disable: 4996)
using namespace std;


typedef long long LongType;

struct CharInfo{

    char _ASCII; //字符
    LongType _count;//次数
    string _code;//权值   

    CharInfo operator+(const CharInfo& info){
        CharInfo ret;
        ret._count = _count + info._count;
        return ret;
    }

    bool operator>(const CharInfo& info){
        return _count > info._count;
    }

    bool operator<(const CharInfo& info){
        return _count < info._count;
    }

    bool operator!=(const CharInfo& info){
        return _count != info._count;
    }

    bool operator==(const CharInfo& info){
        return _count == info._count;
    }

};



class FileCompress{

    typedef HuffmanTreeNode<CharInfo> Node;

    struct TmpInfo{
        char _ch; //字符
        LongType _num;//数量
    };

public:
    FileCompress(){
        for (size_t i = 0; i < 256; ++i){
            _infos[i]._ASCII = i;
            _infos[i]._count = 0;
        }
    }

    //1.统计字符出现的次数
    void Compress(const char* file){

        FILE* f_out = fopen(file, "r");
        assert(f_out);

        char character = fgetc(f_out);

        while (character != EOF){
            _infos[(unsigned char)character]._count++; 
            character = fgetc(f_out);
        }

        //2.构建huffman tree

        CharInfo invalid;
        invalid._count = 0;
        HuffmanTree<CharInfo> tree(_infos, 256, invalid);

        //3.生成code
        GenerateHuffmanCode(tree.GetRoot());

        //4.压缩
        string compressFile = file;  
        compressFile += ".huffman";

        FILE* f_in = fopen(compressFile.c_str(), "wb");

        for (size_t i = 0; i < 256; ++i){
            if (_infos[i]._count > 0){
                TmpInfo tmpinfo;
                tmpinfo._ch = _infos[i]._ASCII; //字符
                tmpinfo._num = _infos[i]._count;//次数

                //解压时需要用其生成Huffman树 
                fwrite(&tmpinfo, sizeof(TmpInfo), 1, f_in);
            }
        }

        TmpInfo tmpinfo;
        tmpinfo._num = -1;
        fwrite(&tmpinfo, sizeof(TmpInfo), 1, f_in);
        rewind(f_out);

        character = fgetc(f_out); 
        size_t CeffectiveBit = 0;
        char value = 0;
        int pos = 0;

        while (character != EOF){

            string& code = _infos[(unsigned char)character]._code;
            for (size_t i = 0; i < code.size(); ++i){

                if (code[i] == '0'){
                    value &= ~(1 << pos);
                }
                else if (code[i] == '1'){
                    value |= (1 << pos);
                }
                else{
                    assert(false);
                }

                ++pos;

                if (pos == 8){
                    fputc(value, f_in);
                    value = 0;
                    pos = 0;
                }
                ++CeffectiveBit;

            }
            character = fgetc(f_out); 
        }
        cout << "压缩Huffman编码长度:" << CeffectiveBit<< endl;

        if (pos > 0){
            fputc(value, f_in);
        }

        fclose(f_out);
        fclose(f_in);
    }


    void GenerateHuffmanCode(Node* root){
        if (root == NULL){
            return;
        }
        if (root->_left == NULL && root->_right == NULL)  {

            string code; 
            Node* cur = root;
            Node* parent = cur->_parent;

            while (parent) {
                if (cur == parent->_left){
                    code.push_back('0');
                }
                else{
                    code.push_back('1');
                }
                cur = parent;
                parent = cur->_parent;
            }

            reverse(code.begin(), code.end());

            _infos[(unsigned char)root->_w._ASCII]._code = code;
            return;
        }
        GenerateHuffmanCode(root->_left);
        GenerateHuffmanCode(root->_right);
    }

    /**************************   解压缩   *******************************/

    void Uncompress(const char* file){
        assert(file);

        string uncompressFile = file;
        size_t pos = uncompressFile.rfind('.');    
        assert(pos != string::npos);

        uncompressFile.erase(pos, uncompressFile.size() - pos);
        uncompressFile += ".unhuffman";//测试所用  分辨与源文件的区别

        FILE* fout = fopen(file, "rb");
        assert(fout);

        FILE* fin = fopen(uncompressFile.c_str(), "w");
        assert(fin);

        TmpInfo info;
        fread(&info, sizeof(TmpInfo), 1, fout);

        while (info._num != -1){
            _infos[(unsigned char)info._ch]._ASCII = info._ch;
            _infos[(unsigned char)info._ch]._count = info._num;
            fread(&info, sizeof(TmpInfo), 1, fout);
        }

        //1.重建huffman tree 
        size_t n = 0;
        CharInfo invalid;
        invalid._count = 0;
        HuffmanTree<CharInfo> tree(_infos, 256, invalid);
        Node* root = tree.GetRoot();
        Node* cur = root;
        LongType fileSize = root->_w._count;
        cout <<"文件中ASCII个数:" <<fileSize << endl;

        //2.解压缩
        char value = fgetc(fout);
        while (1){
            for (size_t pos = 0; pos < 8; ++pos){

                if (value & (1 << pos)){
                    cur = cur->_right;
                }
                else{
                    cur = cur->_left;
                }

                ++n;

                if (NULL == cur->_left && NULL == cur->_right){

                    fputc(cur->_w._ASCII, fin); 
                    cur = root;

                    if (--fileSize == 0){
                        break;
                    }
                }

            }
            if (fileSize == 0){
                break;
            }
            value = fgetc(fout);
        }
        cout <<"解压Huffman编码长度:"<< n << endl;

        fclose(fout);
        fclose(fin);
    }




protected:

    CharInfo _infos[256];

};//类结束

/******************* Test.cpp *********************/

#include "FileCompress.h"


void TestFilecompress()
{
    FileCompress fc;

    size_t start = GetTickCount();
    fc.Compress("Input.txt");
    size_t end = GetTickCount();
    cout << "UseTime:" << end - start << endl;

    fc.Uncompress("Input.txt.huffman");
}

int main(){

    TestFilecompress();

    system("pause");
    return 0;
}

相关标签: 文件压缩