如何用Java实现啥夫曼编码
大家可能会想,程序和第三方提供了很多压缩方式,何必自己写压缩代码呢?不错,如gzip这样的压缩工具很多,可是在某些情况下(如文本内容小且字符不重复),gzip压缩后会比原始文本还要大。所以在某些特殊情况下用自己的压缩方式可以更优。
大家可能早已忘记了在学校学习的哈夫曼知识,可以先在百度百科了解一下哈夫曼知识:
哈夫曼思想:统计文本字符重复率,求出各字符权值,再构造出一颗最优二叉树(又称哈夫曼树),然后给每个叶子结点生成一个以位(bit)为单位的码值,每个码值不能做为其 他码值的前缀,再将码值合并以每8个生成一个字节。
package com.huffman;
/**
* 结点
* @author davee
*/
public class node implements comparable<node> {
int weight;//权值
node leftchild;//左孩子结点
node rightchild;//右孩子结点
string huffcode;
private boolean isleaf;//是否是叶子
character value;
public node(character value, int weight) {
this.value = value;
this.weight = weight;
this.isleaf = true;
}
public node(int weight, node leftchild, node rightchild) {
this.weight = weight;
this.leftchild = leftchild;
this.rightchild = rightchild;
}
public void increaseweight(int i) {
weight += i;
}
public boolean isleaf() {
return isleaf;
}
@override
public int compareto(node o) {
return this.weight - o.weight;
}
}
package com.huffman;
import java.math.biginteger;
import java.util.arraylist;
import java.util.collections;
import java.util.hashmap;
import java.util.map;
import java.util.treemap;
public class huffmantree {
private boolean debug = false;
private hashmap<character, node> nodemap;
private arraylist<node> nodelist;
public huffmantree() {
nodemap = new hashmap<character, node>();
nodelist = new arraylist<node>();
}
public void setdebug(boolean debug) {
this.debug = debug;
}
public string decode(map<string, character> codetable, string binary) {
int begin = 0, end = 1, count = binary.length();
stringbuffer sb = new stringbuffer();
while (end <= count) {
string key = binary.substring(begin, end);
if (codetable.containskey(key)) {
sb.append(codetable.get(key));
begin = end;
} else {
}
end++;
}
return sb.tostring();
}
public string encode(string origintext) {
if (origintext == null) return null;
calculateweight(origintext);
// if (debug) printnodes(nodelist);
node root = generatehuffmantree(nodelist);
generatehuffmancode(root, "");
if (debug) printnodes(root);
stringbuffer sb = new stringbuffer();
for (character key : origintext.tochararray()) {
sb.append(nodemap.get(key).huffcode);
}
if (debug) system.out.println("二进制:"+sb.tostring());
return sb.tostring();
}
/**
* 计算叶子权值
* @param text
*/
private void calculateweight(string text) {
for (character c : text.tochararray()) {
if (nodemap.containskey(c)) {
nodemap.get(c).increaseweight(1);//权值加1
} else {
node leafnode = new node(c, 1);
nodelist.add(leafnode);
nodemap.put(c, leafnode);
}
}
}
/**
* 生成哈夫曼树
* @param nodes
*/
private node generatehuffmantree(arraylist<node> nodes) {
collections.sort(nodes);
while(nodes.size() > 1) {
node ln = nodes.remove(0);
node rn = nodes.remove(0);
insertsort(nodes, new node(ln.weight + rn.weight, ln, rn));
}
node root = nodes.remove(0);
nodes = null;
return root;
}
/**
* 插入排序
* @param sortednodes
* @param node
*/
private void insertsort(arraylist<node> sortednodes, node node) {
if (sortednodes == null) return;
int weight = node.weight;
int min = 0, max = sortednodes.size();
int index;
if (sortednodes.size() == 0) {
index = 0;
} else if (weight < sortednodes.get(min).weight) {
index = min;//插入到第一个
} else if (weight >= sortednodes.get(max-1).weight) {
index = max;//插入到最后
} else {
index = max/2;
for (int i=0, count=max/2; i<=count; i++) {
if (weight >= sortednodes.get(index-1).weight && weight < sortednodes.get(index).weight) {
break;
} else if (weight < sortednodes.get(index).weight) {
max = index;
} else {
min = index;
}
index = (max + min)/2;
}
}
sortednodes.add(index, node);
}
private void generatehuffmancode(node node, string code) {
if (node.isleaf()) node.huffcode = code;
else {
generatehuffmancode(node.leftchild, code + "0");
generatehuffmancode(node.rightchild, code + "1");
}
}
/**
* 生成码表
* @return
*/
public map<string, character> getcodetable() {
map<string, character> map = new hashmap<string, character>();
for (node node : nodemap.values()) {
map.put(node.huffcode, node.value);
}
return map;
}
/**
* 打印节点信息
* @param root
*/
private void printnodes(node root) {
system.out.println("字符 权值 哈夫码");
printtree(root);
}
private void printtree(node root) {
if (root.isleaf()) system.out.println((root.value == null ? " " : root.value)+" "+root.weight+" "+(root.huffcode == null ? "" : root.huffcode));
if (root.leftchild != null) printtree(root.leftchild);
if (root.rightchild != null) printtree(root.rightchild);
}
/**
* 打印节点信息
* @param nodes
*/
private void printnodes(arraylist<node> nodes) {
system.out.println("字符 权值 哈夫码");
for (node node : nodes) {
system.out.println(node.value+" "+node.weight+" "+node.huffcode);
}
}
}
package com.test;
import java.util.map;
import com.huffman.huffutils;
import com.huffman.huffmantree;
public class test {
public static void main(string[] args) {
string origintext = "abcdacaha";
huffmantree huffmantree = new huffmantree();
huffmantree.setdebug(true);//测试
string binary = huffmantree.encode(origintext);
byte[] bytes = huffutils.binary2bytes(binary);
map<string, character> codetable = huffmantree.getcodetable();
int lastbytenum = binary.length() % 8;
system.out.println(bytes.length);
//将bytes、codetable、 lastbytenum传递到服务器端
//省略。。。。。。
/*
服务器端解析
接收到参数,并转换成bytes、relationmap、 lastbytenum
*/
string fullbinary = huffutils.bytes2binary(bytes, lastbytenum);
system.out.println("服务器二进制:"+fullbinary);
string retrievetext = huffmantree.decode(codetable, fullbinary);
system.out.println("恢复文本:"+retrievetext);
}
}