欢迎您访问程序员文章站,本站旨在为大家提供并分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

Java正则表达式匹配网页所有网址和链接文字的示例

程序员文章站 2024-02-26 23:49:46
复制代码 代码如下:import java.io.BufferedReader;import java.io.IOException;import java.io.Inpu...

复制代码 代码如下:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import java.net.*;
import java.io.*;
import java.util.regex.*;

/**
 * Downloads a web page, cuts out the region between two marker strings and
 * prints every anchor element found in that region, followed by its link text
 * and its href value.
 *
 * <p>Typical use: construct with the begin/end markers, call
 * {@link #getstarturl(String)}, {@link #geturlcontent()},
 * {@link #getcontentarea()} and finally {@link #urls()}.
 *
 * <p>Despite their "get" prefix, the one-argument {@code get...} methods are
 * setters: each stores its argument in the field of the same name.
 */
public class urls
{
    /** Regex for one anchor element: from "&lt;a" to the nearest following "/a&gt;". */
    private static final String ANCHOR_REGEX = "<a.*?/a>";

    // Compiled once; these patterns never change between calls.
    private static final Pattern ANCHOR_PATTERN = Pattern.compile(ANCHOR_REGEX);
    private static final Pattern TITLE_PATTERN = Pattern.compile(">.*?</a>");
    private static final Pattern HREF_PATTERN = Pattern.compile("href=.*?>");

    private String starturl;                    // address of the page to fetch
    String urlcontent;                          // page text, line terminators removed; null until fetched
    String contentarea;                         // part of urlcontent between the two markers
    private String strareabegin, strareaend;    // markers delimiting the region to scan
    private String stringinurl, stringnotinurl; // keyword filters; stored but not applied by this class yet
    String strcontent;                          // reserved for collected content; never written here
    String[] allurls;                           // reserved for collected addresses; never written here
    private String regex = ANCHOR_REGEX;        // rule used by urls() to find anchors

    urlandtitle urlandtitle = new urlandtitle(); // holder for one address/title pair


    /**
     * Demo entry point: fetches a fixed page, extracts the body region and
     * prints all anchors found in it.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        urls myurl = new urls("<body", "/body>");
        myurl.getstarturl("http://www.zuzwn.com/");
        myurl.geturlcontent();
        myurl.getcontentarea();
        myurl.getstringnotinurl("google");
        myurl.urls();
    }


    /**
     * Creates a collector that scans the text between the two given markers.
     *
     * @param strareabegin text that precedes the region to scan
     * @param strareaend   text that follows the region to scan
     */
    public urls(String strareabegin, String strareaend)
    {
        this.strareabegin = strareabegin;
        this.strareaend = strareaend;
    }

    /**
     * Prints every anchor element in {@code contentarea}. For each anchor it
     * prints the raw match, then its link text, then its href value, then an
     * empty line; finally it prints the number of anchors found.
     *
     * <p>A {@code contentarea} that was never set is treated as empty.
     */
    public void urls()
    {
        String area = (contentarea == null) ? "" : contentarea;
        int count = 0;

        Matcher anchors = ANCHOR_PATTERN.matcher(area);
        while (anchors.find())
        {
            String anchor = anchors.group();
            System.out.println(anchor);
            count++;

            // Link text: everything from a '>' up to the closing tag, with
            // the delimiters stripped afterwards.
            Matcher title = TITLE_PATTERN.matcher(anchor);
            while (title.find())
            {
                System.out.println("标题:" + title.group().replaceAll(">|</a>", ""));
            }

            // Address: everything from "href=" to the next '>'; surrounding
            // quotes and any further attributes are left in place.
            Matcher href = HREF_PATTERN.matcher(anchor);
            while (href.find())
            {
                System.out.println("网址:" + href.group().replaceAll("href=|>", ""));
            }

            System.out.println();
        }

        System.out.println("共有" + count + "个符合结果");
    }

    /**
     * Sets the address of the page to fetch.
     *
     * @param starturl page address
     */
    public void getstarturl(String starturl)
    {
        this.starturl = starturl;
    }

    /**
     * Downloads the page at {@code starturl} into {@code urlcontent}.
     *
     * <p>Lines are concatenated without their terminators, so the whole page
     * becomes a single line. On any failure a message and the stack trace are
     * printed and {@code urlcontent} keeps its previous value.
     */
    public void geturlcontent()
    {
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // on both the success and the failure path.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new URL(starturl).openStream())))
        {
            String line;
            while ((line = br.readLine()) != null)
            {
                page.append(line);
            }
            urlcontent = page.toString();
        }
        catch (Exception e)
        {
            // Best effort: report the problem and leave urlcontent untouched.
            System.out.println("网址文件未能输出");
            e.printStackTrace();
        }
    }


    /**
     * Stores in {@code contentarea} the part of {@code urlcontent} that lies
     * between the begin marker and the next end marker.
     *
     * <p>If the page was never fetched, a marker is null, or the begin marker
     * does not occur, the area is the empty string. If only the end marker is
     * missing, the area runs to the end of the page.
     */
    public void getcontentarea()
    {
        if (urlcontent == null || strareabegin == null || strareaend == null)
        {
            contentarea = "";
            return;
        }

        int begin = urlcontent.indexOf(strareabegin);
        if (begin < 0)
        {
            contentarea = "";
            return;
        }

        int from = begin + strareabegin.length();
        int to = urlcontent.indexOf(strareaend, from);
        contentarea = (to < 0) ? urlcontent.substring(from) : urlcontent.substring(from, to);
    }

    /**
     * Sets the keyword an address should contain. Only stored; this class
     * does not filter on it yet.
     *
     * @param stringinurl required keyword
     */
    public void getstringinurl(String stringinurl)
    {
        this.stringinurl = stringinurl;
    }

    /**
     * Sets the keyword an address must not contain. Only stored; this class
     * does not filter on it yet.
     *
     * @param stringnotinurl forbidden keyword
     */
    public void getstringnotinurl(String stringnotinurl)
    {
        this.stringnotinurl = stringnotinurl;
    }

    /** Placeholder for address extraction; currently does nothing. */
    public void geturl()
    {

    }

    /**
     * Returns the rule used to find anchors.
     *
     * @return the anchor regular expression
     */
    public String getregex()
    {
        return regex;
    }

    /** Plain holder for one address and its link text. */
    class urlandtitle
    {
        String myurl;
        String title;
    }
}