Java 使用 HTMLParser 提取网页纯文本例子
package com.test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
/**
* 标题:利用htmlparser提取网页纯文本的例子
*/
public class testhtmlparser {
public static void testhtml() {
try {
string scurrentline;
string stotalstring;
scurrentline = "";
stotalstring = "";
java.io.inputstream l_urlstream;
java.net.url l_url = new java.net.url("http://www.ideagrace.com/html/doc/2006/07/04/00929.html");
java.net.httpurlconnection l_connection = (java.net.httpurlconnection) l_url.openconnection();
l_connection.connect();
l_urlstream = l_connection.getinputstream();
java.io.bufferedreader l_reader = new java.io.bufferedreader(new java.io.inputstreamreader(l_urlstream));
while ((scurrentline = l_reader.readline()) != null) {
stotalstring += scurrentline+"/r/n";
// system.out.println(stotalstring);
}
string testtext = extracttext(stotalstring);
system.out.println( testtext );
} catch (exception e) {
e.printstacktrace();
}
}
public static string extracttext(string inputhtml) throws exception {
stringbuffer text = new stringbuffer();
parser parser = parser.createparser(new string(inputhtml.getbytes(),"gbk"), "gbk");
// 遍历所有的节点
nodelist nodes = parser.extractallnodesthatmatch(new nodefilter() {
public boolean accept(node node) {
return true;
}
});
system.out.println(nodes.size()); //打印节点的数量
for (int i=0;i<nodes.size();i++){
node nodet = nodes.elementat(i);
//system.out.println(nodet.gettext());
text.append(new string(nodet.toplaintextstring().getbytes("gbk"))+"/r/n");
}
return text.tostring();
}
public static void test5(string resource) throws exception {
parser myparser = new parser(resource);
myparser.setencoding("gbk");
string filterstr = "table";
nodefilter filter = new tagnamefilter(filterstr);
nodelist nodelist = myparser.extractallnodesthatmatch(filter);
tabletag tabletag = (tabletag) nodelist.elementat(11);
}
public static void main(string[] args) throws exception {
// test5("http://www.google.com");
testhtml();
}
}