Java抓取网页数据:获取网页中所有链接的实例分享
效果图
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page and collects the anchor elements ({@code <a ... href=...>...</a>})
 * it contains.
 *
 * <p>The page is scanned line by line with a regular expression, not a real HTML parser,
 * so only anchors whose opening and closing tags sit on the same line are found.
 */
public class htmlparser {
    /** Charset used when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Captures the charset parameter of a Content-Type value, e.g. {@code text/html; charset=GBK}. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Matches one anchor element carrying an href attribute. The reluctant quantifiers stop
     * at the first closing tag so that several anchors on one line are matched separately.
     */
    private static final Pattern HREF_PATTERN =
            Pattern.compile("<a\\b[^>]*?\\shref\\s*=.*?</a>", Pattern.CASE_INSENSITIVE);

    /** Address of the page to analyse. */
    String htmlurl;

    /** Anchor elements found by the most recent call to {@link #gethreflist()}. */
    ArrayList<String> hreflist = new ArrayList<>();

    /** Charset used to decode the page; assigned while parsing. */
    String charset;

    /**
     * Creates a parser for the given page.
     *
     * @param htmlurl address of the page to analyse
     */
    public htmlparser(String htmlurl) {
        this.htmlurl = htmlurl;
    }

    /**
     * Fetches the page and returns the anchor elements found in it.
     *
     * <p>Each call fetches the page again and replaces the previous results, so calling it
     * repeatedly does not accumulate duplicates.
     *
     * @return the matched anchor elements, in document order
     * @throws IOException if the URL is malformed, the page cannot be read, or the declared
     *         charset is not supported
     */
    public ArrayList<String> gethreflist() throws IOException {
        parser();
        return hreflist;
    }

    /**
     * Opens the page, decodes it with the declared (or default) charset and fills
     * {@link #hreflist} with every anchor element found.
     *
     * @throws IOException if the URL is malformed, the page cannot be read, or the declared
     *         charset is not supported
     */
    private void parser() throws IOException {
        URL url = new URL(htmlurl);
        // Only URLConnection methods are needed, and nothing is written to the connection,
        // so there is no HttpURLConnection cast and no setDoOutput(true).
        URLConnection connection = url.openConnection();
        charset = getcharset(connection.getContentType());
        hreflist.clear();
        // try-with-resources closes the stream even if the reader cannot be created or
        // reading fails part-way through.
        try (InputStream in = connection.getInputStream();
                BufferedReader br = new BufferedReader(new InputStreamReader(in, charset))) {
            String line;
            while ((line = br.readLine()) != null) {
                collecthrefs(line);
            }
        }
    }

    /**
     * Extracts the charset name from a Content-Type header value.
     *
     * @param str the Content-Type value; may be {@code null}
     * @return the declared charset name, or {@code "UTF-8"} when none is declared
     */
    private static String getcharset(String str) {
        if (str == null) {
            return DEFAULT_CHARSET;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(str);
        return matcher.find() ? matcher.group(1) : DEFAULT_CHARSET;
    }

    /**
     * Appends every anchor element found in one line of the page to {@link #hreflist}.
     *
     * @param line one line of the page source
     */
    private void collecthrefs(String line) {
        Matcher matcher = HREF_PATTERN.matcher(line);
        while (matcher.find()) {
            hreflist.add(matcher.group());
        }
    }

    /**
     * Demo entry point: prints every anchor element found on a sample news page.
     *
     * @param arg unused
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] arg) throws IOException {
        htmlparser a = new htmlparser("http://news.163.com/");
        for (String href : a.gethreflist()) {
            System.out.println(href);
        }
    }
}