欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

java正则表达式解析html示例分享

程序员文章站 2024-02-25 10:40:59
复制代码 代码如下:package work; import java.io.BufferedReader; import java.io.IOException; impo...

复制代码 代码如下:

package work;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
public class chuanboyi {

 public static void main(string[] args){
  // todo auto-generated method stub
  stringbuffer html = new stringbuffer();
  httpclient httpclient = new httpclient();
  //创建get方法实例
  getmethod getmethod = new getmethod("//www.jb51.net");
  //使用系统提供的默认恢复策略
  getmethod.getparams().setparameter(httpmethodparams.retry_handler, new defaulthttpmethodretryhandler());
  try{
   //执行get方法
   int statuscode = httpclient.executemethod(getmethod);
   if(statuscode != httpstatus.sc_ok){
    system.out.println("method is wrong " + getmethod.getstatusline());
   }
   inputstream responsebody = getmethod.getresponsebodyasstream();
   bufferedreader reader = new bufferedreader(new inputstreamreader(responsebody,"utf-8"));
   string line = reader.readline();
   while(line != null){
    html.append(line).append("\n");
    line = reader.readline();
   }
   reader.close();
   //正则表达式
   string regex = "<form name=\"compareform\"[\\s\\s]+>[\\s\\s]+</form>.*<script.*>";
   string regexa ="(?<=<li>)[\\s\\s]+?(?=</li>)";
   pattern pattern = pattern.compile(regex);
         matcher m = pattern.matcher(html);
         stringbuffer str = new stringbuffer();
         int i = 0;
         while(m.find()){
          str.append(m.group());
         }
         pattern = pattern.compile(regexa);
         m = pattern.matcher(str);
         while(m.find()){
          attrs(m.group());
          i++;
         }
         system.out.println("共有"+i+"条数据!");
  }catch (httpexception e) {
   // todo: handle exception
   system.out.println("please check your provided http address!");
   e.printstacktrace();
  }catch (ioexception e) {
   // todo: handle exception
   system.out.println("the line is wrong!");
   e.printstacktrace();
  }finally{
   getmethod.releaseconnection();//释放链接
  }
 }
 public static void attrs(string str){

  //获取url的正则表达式
  string regexurl = "[a-z]+-[0-9]+\\.html";
  //获取name的正则表达式
  string regexname = "(?<=title=\")[[\\w-\\s][^x00-xff]]+(?=\")";
  //获取图片的正则表达式
  string regexpicture = "images.*\\.jpg";

  pattern patternurl = pattern.compile(regexurl);
  pattern patternname = pattern.compile(regexname);
  pattern patternpicture = pattern.compile(regexpicture);
  matcher murl = patternurl.matcher(str);
  matcher mname = patternname.matcher(str);
  matcher mpicture = patternpicture.matcher(str);
  if(mname.find()){
   system.out.println("名字:"+mname.group());
  }
  if(murl.find()){
   system.out.println("链接:"+murl.group());
  }
  if(mpicture.find()){
   system.out.println("图片:"+mpicture.group());
  }
 } 
}