
A Simple Crawler Program


I have an information retrieval assignment coming up, and since I had some free time today I wrote a simple crawler, about as simple as it gets. For now it crawls 1000 pages and saves them for later processing.

The interface: WebPage.java

import java.io.File;
import java.net.MalformedURLException;

/*
 * Defines the basic operations of a WebPage object.
 */
public interface WebPage
{
	/** Download the page at the given URL into a local file */
	public File getPageFile();
	
	/** Parse the contents of the page
	 * @throws MalformedURLException */
	public void parse() throws MalformedURLException;
}
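
Before the implementation, a quick sketch of how the interface is meant to be driven: download first, then parse, the same order run() uses below. CrawlDemo is an illustrative name, not part of the original code:

import java.net.MalformedURLException;
import java.net.URL;

public class CrawlDemo
{
	public static void main(String[] args) throws MalformedURLException
	{
		// HTMLPage is the concrete WebPage defined below
		WebPage page = new HTMLPage(new URL("http://www.baidu.com"));
		page.getPageFile(); // save the raw HTML locally
		page.parse();       // extract links from the saved contents
	}
}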

The implementation and test class: HTMLPage.java

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A simple spider. Running it produces plenty of exceptions; we ignore
 * them for now.
 * @author gbk
 *
 */
public class HTMLPage extends Thread implements WebPage
{
	private static int pageId = 0;
	private static final int MAX_PAGENUM = 1000;
	// Holds every URL already handled, so no page is processed twice
	private static Set<String> urls = new HashSet<String>();
	private File localFile;
	private StringBuffer contents;
	private URL url;

	public HTMLPage(URL url)
	{
		this.url = url;
	}
	
	/**
	 * Download the page to a local file for later analysis.
	 */
	public File getPageFile()
	{
		int ch = 0;
		contents = new StringBuffer();
		pageId++;
		// Assumes the directory d:/html already exists
		localFile = new File("d:/html/" + pageId + ".txt");
		try
		{
			InputStream inputStream = url.openStream();
			InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
			FileOutputStream fileOutputStream = new FileOutputStream(localFile);
			OutputStreamWriter outputStreamWriter = new OutputStreamWriter(fileOutputStream);
			while((ch = inputStreamReader.read()) != -1) 
			{ 
				contents.append((char)ch);
				outputStreamWriter.write(ch); 
			} 
			outputStreamWriter.close();
			fileOutputStream.close();
			inputStreamReader.close();
			inputStream.close();
		} catch (FileNotFoundException e)
		{
			e.printStackTrace();
		} catch (IOException e)
		{
			e.printStackTrace();
		}
		return localFile;
	}
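
	/*
	 * Note: the InputStreamReader above uses the platform default charset,
	 * so pages served in a different encoding come out garbled. A hedged
	 * alternative, assuming the page is UTF-8, would be:
	 *
	 *     new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8)
	 */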
	
	/**
	 * Parse the page and add every URL not seen before to the candidate set.
	 */
	public void parse() throws MalformedURLException
	{
        // Relative links (those without http) are not handled
        String regex = "<a.*?href=http://.*?>.*?</a>";
        Pattern pt = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        Matcher mt = pt.matcher(contents);
        while(mt.find())
        {         
            // Pull the href value out of the matched anchor tag
            Matcher myurl = Pattern.compile("href=.*?>").matcher(mt.group());
            while(myurl.find())
            {
            	String url = myurl.group().replaceAll("href=|>", "");
            	// Not synchronized, so a few extra files show up at the end
            	if(!urls.contains(url) && pageId < MAX_PAGENUM)
            	{
            		urls.add(url);
            		// Spawn a new thread to repeat the whole process
            		HTMLPage page = new HTMLPage(new URL(url));
            		page.start();
            	}
            }
            System.out.println();         
        }
	}
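
	/*
	 * The regex above skips relative links on purpose. If they were wanted,
	 * java.net.URL can resolve one against the current page; a sketch
	 * (relativeHref is a hypothetical variable, not from the code above):
	 *
	 *     URL absolute = new URL(this.url, relativeHref);
	 *
	 * The absolute form could then go through the same urls check.
	 */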
	
	// Thread body: download the page, then parse it for new links
	public void run()
	{
	    getPageFile();
		try
		{
			parse();
		} catch (MalformedURLException e)
		{
			e.printStackTrace();
		}
	}
	
	public static void main(String[] args) throws MalformedURLException
	{
		// Seed the crawl with a single start page
		HTMLPage page = new HTMLPage(new URL("http://www.baidu.com"));
		page.start();
	}
}
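
The extra files mentioned in the parse() comment come from pageId and urls being shared across threads with no synchronization. A minimal sketch of thread-safe shared state, assuming Java 8+ (the class and method names here are illustrative, not part of the original):

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

public class CrawlState
{
	// Atomic counter: incrementAndGet() hands each thread a unique id
	private static final AtomicInteger pageId = new AtomicInteger(0);
	private static final int MAX_PAGENUM = 1000;
	// Concurrent set: add() returns false when another thread was first
	private static final Set<String> urls = ConcurrentHashMap.newKeySet();

	/** Claim the next page id, or -1 once the crawl limit is reached. */
	static int nextPageId()
	{
		int id = pageId.incrementAndGet();
		return id <= MAX_PAGENUM ? id : -1;
	}

	/** True only for the first thread to see this url. */
	static boolean markSeen(String url)
	{
		return urls.add(url);
	}
}

With something like this in place, getPageFile() would take its file name from nextPageId(), and parse() would spawn a new thread only when markSeen(url) returns true, capping the output at exactly MAX_PAGENUM files.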