Java正则表达式解析HTML示例分享
package work;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
/**
 * Example program: downloads an HTML page with Apache Commons HttpClient
 * ({@code org.apache.commons.httpclient}) and extracts list entries from it
 * with regular expressions.
 *
 * <p>The page is fetched with a GET request, the {@code compareform} form is
 * located, every {@code <li>} element inside it is isolated, and for each
 * element the name, link and picture path are printed to standard output.
 */
public class chuanboyi {

    /** Absolute URL of the page to download (a scheme is required by GetMethod). */
    private static final String PAGE_URL = "http://www.jb51.net";

    /** Charset used to decode the response body. */
    private static final String PAGE_CHARSET = "utf-8";

    /**
     * Matches the whole {@code compareform} form up to the following script tag.
     * {@code [\s\S]} is used instead of {@code .} so the match can span line breaks;
     * the trailing {@code .*} parts do not cross line breaks.
     * NOTE(review): the form name literal may be case-sensitive on the real page - confirm.
     */
    private static final Pattern FORM_PATTERN = Pattern.compile(
            "<form name=\"compareform\"[\\s\\S]+>[\\s\\S]+</form>.*<script.*>");

    /** Matches the content between {@code <li>} and {@code </li>}, as short as possible. */
    private static final Pattern ITEM_PATTERN = Pattern.compile("(?<=<li>)[\\s\\S]+?(?=</li>)");

    /** Matches a relative link such as {@code abc-123.html}. */
    private static final Pattern URL_PATTERN = Pattern.compile("[a-z]+-[0-9]+\\.html");

    /**
     * Matches the value of a {@code title="..."} attribute: word characters, hyphens,
     * whitespace and any character outside the range U+0000-U+00FF.
     */
    private static final Pattern NAME_PATTERN =
            Pattern.compile("(?<=title=\")[[\\w-\\s][^\\x00-\\xff]]+(?=\")");

    /** Matches a picture path starting with {@code images} and ending with {@code .jpg}. */
    private static final Pattern PICTURE_PATTERN = Pattern.compile("images.*\\.jpg");

    /**
     * Downloads {@link #PAGE_URL}, prints every entry found in it and finally the
     * number of entries.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        HttpClient httpClient = new HttpClient();
        // Create the GET method instance.
        GetMethod getMethod = new GetMethod(PAGE_URL);
        // Use the default retry handler provided by the library.
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        try {
            // Execute the GET request.
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.out.println("method is wrong " + getMethod.getStatusLine());
                // Do not try to parse the body of a failed request.
                return;
            }
            String html = readBody(getMethod.getResponseBodyAsStream());
            int count = printItems(html);
            System.out.println("共有" + count + "条数据!");
        } catch (HttpException e) {
            System.out.println("please check your provided http address!");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("the line is wrong!");
            e.printStackTrace();
        } finally {
            // Release the connection.
            getMethod.releaseConnection();
        }
    }

    /**
     * Prints the name, link and picture path found in one HTML fragment.
     * Each of the three values is printed only if its pattern matches, and only
     * the first match of each pattern is used.
     *
     * @param str HTML fragment to inspect; must not be {@code null}
     */
    public static void attrs(String str) {
        Matcher nameMatcher = NAME_PATTERN.matcher(str);
        if (nameMatcher.find()) {
            System.out.println("名字:" + nameMatcher.group());
        }
        Matcher urlMatcher = URL_PATTERN.matcher(str);
        if (urlMatcher.find()) {
            System.out.println("链接:" + urlMatcher.group());
        }
        Matcher pictureMatcher = PICTURE_PATTERN.matcher(str);
        if (pictureMatcher.find()) {
            System.out.println("图片:" + pictureMatcher.group());
        }
    }

    /**
     * Reads the whole response body as text, terminating every line with {@code "\n"}.
     * The stream is always closed, also when reading fails.
     *
     * @param body response stream, or {@code null} when the response has no body
     * @return the decoded body, or an empty string for a {@code null} stream
     * @throws IOException if reading or decoding fails
     */
    private static String readBody(InputStream body) throws IOException {
        if (body == null) {
            return "";
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(body, PAGE_CHARSET));
        try {
            StringBuilder html = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                html.append(line).append("\n");
            }
            return html.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Finds all form sections in the page, then passes every list item inside
     * them to {@link #attrs(String)}.
     *
     * @param html full page text
     * @return number of list items that were found and printed
     */
    private static int printItems(CharSequence html) {
        StringBuilder forms = new StringBuilder();
        Matcher formMatcher = FORM_PATTERN.matcher(html);
        while (formMatcher.find()) {
            forms.append(formMatcher.group());
        }
        int count = 0;
        Matcher itemMatcher = ITEM_PATTERN.matcher(forms);
        while (itemMatcher.find()) {
            attrs(itemMatcher.group());
            count++;
        }
        return count;
    }
}
上一篇: MySQL查询优化的5个实用技巧
下一篇: 详解Java适配器模式