A Targeted Website Crawler --- A Beginner Example    Blog categories: Java / Lucene / Data Structures and Algorithms
1: URL handling and HTML parsing
package com.xiaoshuo.util;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.xiaoshuo.to.Chapter;
import com.xiaoshuo.to.UrlTO;

/**
 * Fetches pages and parses their HTML.
 * @author lijunqing
 */
public class PaserUrlUtil {

    private HttpClient httpClient = new DefaultHttpClient();

    /**
     * Downloads a page and returns its HTML as a string.
     */
    public String getHtmlStr(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        String htmlStr = null;
        try {
            HttpResponse response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // the target site serves GBK; decode it explicitly to avoid garbled text
                htmlStr = EntityUtils.toString(entity, "gbk");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return htmlStr;
    }

    /**
     * Fetches a URL and parses it into a jsoup Document.
     */
    public Document getDocument(String url) throws Exception {
        Thread.sleep(10 * 1000); // be polite: pause 10 seconds between requests
        return Jsoup.parse(getHtmlStr(url));
    }

    /**
     * Extracts the category links from the home page.
     */
    public List<UrlTO> getCategoryUrls(String url) throws Exception {
        Document doc = getDocument(url);
        List<UrlTO> urlList = new ArrayList<UrlTO>();
        Elements elements = doc.select(".navlist").select("li").select("a");
        for (Element element : elements) {
            String categoryUrl = element.attr("href");
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(1);
            urlTO.setUrl(categoryUrl);
            urlList.add(urlTO);
        }
        return urlList;
    }

    /**
     * Collects all paginated listing URLs under one category.
     */
    public List<UrlTO> getBookUrls(String categoryUrl) throws Exception {
        System.out.println("getBookUrls - processing, deptValue == 1");
        List<UrlTO> urlTOList = new ArrayList<UrlTO>();
        String nextUrl = getNextBookUrl(categoryUrl);
        while (nextUrl != null && !nextUrl.trim().equals("")) {
            System.out.println("bookUrls--" + nextUrl);
            // one fresh UrlTO per page; reusing a single object would make every
            // list entry point at the same, repeatedly overwritten URL
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(2);
            urlTO.setUrl(nextUrl);
            urlTOList.add(urlTO);
            nextUrl = getNextBookUrl(nextUrl);
        }
        return urlTOList;
    }

    /**
     * Finds the "next page" link on a category listing.
     */
    public String getNextBookUrl(String categoryUrl) throws Exception {
        Document doc = getDocument(categoryUrl);
        Elements elements = doc.select("#pagelink").select("strong + a");
        if (elements.isEmpty()) { // select() never returns null; check for an empty result instead
            return null;
        }
        return elements.first().attr("href");
    }

    /**
     * Extracts the book-detail URLs from one listing page.
     */
    public List<UrlTO> getDetailUrlList(String categoryUrl) throws Exception {
        Document doc = getDocument(categoryUrl);
        Elements elements = doc.select(".grid").select("tr");
        List<UrlTO> urlTOList = new ArrayList<UrlTO>();
        for (Element element : elements) {
            // assumes the detail link is an anchor inside the first cell of each row
            Element link = element.select("td a").first();
            if (link == null) { // skip header rows or rows without a link
                continue;
            }
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(3);
            urlTO.setUrl(link.attr("href"));
            urlTOList.add(urlTO);
        }
        return urlTOList;
    }

    /**
     * Follows the "start reading" button on a book-detail page.
     */
    public UrlTO getToReadUrl(String detailUrl) throws Exception {
        Document doc = getDocument(detailUrl);
        String toReadUrl = doc.select("#bt_1").select("a").first().attr("href");
        UrlTO urlTO = new UrlTO();
        urlTO.setDeptValue(4);
        urlTO.setUrl(toReadUrl);
        return urlTO;
    }

    /**
     * Extracts the chapter URLs from a book's table of contents.
     */
    public List<UrlTO> getChapterList(String detailUrl) throws Exception {
        Document doc = getDocument(detailUrl);
        Elements elements = doc.select(".list").select("dd").select("a");
        List<UrlTO> urlList = new ArrayList<UrlTO>();
        for (Element element : elements) {
            String chapterUrl = detailUrl + element.attr("href");
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(5);
            urlTO.setUrl(chapterUrl);
            urlList.add(urlTO); // without this the method always returned an empty list
        }
        return urlList;
    }

    /**
     * Parses one chapter page into a Chapter object.
     */
    public Chapter getChapter(String chapterUrl) throws Exception {
        Document doc = getDocument(chapterUrl);
        Chapter chapter = new Chapter();
        chapter.setName(doc.select("h1").text());
        chapter.setContent(doc.select(".width").text());
        return chapter;
    }
}
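To sanity-check the parser on its own before wiring it into the crawler, it can be exercised against a single chapter page. This is only a minimal sketch: the chapter URL is a placeholder, the class name PaserUrlUtilDemo is my own, and Chapter is assumed to have getters matching the setters used above.

package com.xiaoshuo.util;

import com.xiaoshuo.to.Chapter;

/**
 * Quick manual test of PaserUrlUtil, independent of the crawler.
 */
public class PaserUrlUtilDemo {

    public static void main(String[] args) throws Exception {
        PaserUrlUtil paserUrlUtil = new PaserUrlUtil();
        // placeholder URL; substitute a real chapter page from the target site
        Chapter chapter = paserUrlUtil.getChapter("http://www.shuoshuo520.com/xxx/1.html");
        System.out.println(chapter.getName());
        System.out.println(chapter.getContent());
    }
}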
2: The URL entity class
package com.xiaoshuo.to;

/**
 * Holds one URL together with its depth in the crawl.
 * @author lijunqing
 */
public class UrlTO {

    private Integer deptValue;
    private String url;

    public Integer getDeptValue() {
        return deptValue;
    }

    public void setDeptValue(Integer deptValue) {
        this.deptValue = deptValue;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    @Override
    public String toString() {
        return "dept=" + deptValue + "--url--" + url;
    }
}
3: The URL queue class
package com.xiaoshuo.url;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

import com.xiaoshuo.to.UrlTO;

/**
 * Keeps track of visited and not-yet-visited URLs.
 * @author lijunqing
 */
public class LinkQueue {

    // URLs that have already been visited
    private static Set<String> visitedUrl = new HashSet<String>();

    // URLs waiting to be visited
    private static Queue<UrlTO> unVisitedUrl = new LinkedList<UrlTO>();

    public static Queue<UrlTO> getUnVisitedUrl() {
        return unVisitedUrl;
    }

    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    public static UrlTO unVisitedPoll() {
        return unVisitedUrl.poll();
    }

    public static void addVisitedUrl(String url) {
        System.out.println("URL already visited -- " + url);
        visitedUrl.add(url);
    }

    public static void addUnVisitedUrl(UrlTO url) {
        if (url != null && !url.getUrl().trim().equals("") && !visitedUrl.contains(url.getUrl())) {
            System.out.println("Adding new URL to the queue: " + url.getUrl());
            unVisitedUrl.offer(url);
        }
    }

    public static Integer getVisitedUrlNum() {
        return visitedUrl.size();
    }

    public static boolean unVisitedUrlEmpty() {
        return unVisitedUrl.isEmpty();
    }
}
4: The Crawler class
package com.xiaoshuo.service;

import java.util.ArrayList;
import java.util.List;

import com.xiaoshuo.to.UrlTO;
import com.xiaoshuo.url.LinkQueue;
import com.xiaoshuo.util.PaserUrlUtil;

/**
 * Breadth-first crawler.
 * @author lijunqing
 */
public class Crawler {

    PaserUrlUtil paseUrlUtil = new PaserUrlUtil();

    /**
     * Seeds the queue with the start URL at depth 0.
     */
    public void initCrawlerBySeed(String url) {
        UrlTO urlTO = new UrlTO();
        urlTO.setDeptValue(0);
        urlTO.setUrl(url);
        LinkQueue.addUnVisitedUrl(urlTO);
        System.out.println("UrlTO-----" + urlTO);
    }

    /**
     * Breadth-first search over the site, one layer of the URL hierarchy per depth value.
     */
    public void crawlerByBSF() throws Exception {
        // seed URL
        String url = "http://www.shuoshuo520.com/";
        // enqueue the seed
        initCrawlerBySeed(url);
        System.out.println("seed-----" + url);
        while (!LinkQueue.unVisitedUrlEmpty()) {
            UrlTO visitUrl = LinkQueue.unVisitedPoll();
            if (visitUrl == null)
                continue;
            Integer deptValue = visitUrl.getDeptValue();
            String nextUrl = visitUrl.getUrl();
            // mark it as visited
            LinkQueue.addVisitedUrl(nextUrl);
            System.out.println("Processing URL -- deptValue--" + deptValue + "--url--" + nextUrl);
            // start from an empty list so the loop below never sees null
            List<UrlTO> unVisitUrlList = new ArrayList<UrlTO>();
            if (deptValue == 0) {
                unVisitUrlList = paseUrlUtil.getCategoryUrls(nextUrl);
            } else if (deptValue == 1) {
                unVisitUrlList = paseUrlUtil.getBookUrls(nextUrl);
            } else if (deptValue == 2) {
                unVisitUrlList = paseUrlUtil.getDetailUrlList(nextUrl);
            } else if (deptValue == 3) {
                unVisitUrlList.add(paseUrlUtil.getToReadUrl(nextUrl));
            } else if (deptValue == 4) {
                unVisitUrlList = paseUrlUtil.getChapterList(nextUrl);
            } else if (deptValue == 5) {
                // deepest layer: chapter pages carry the content (see getChapter); nothing further to enqueue
            }
            for (UrlTO urlTO : unVisitUrlList) {
                LinkQueue.addUnVisitedUrl(urlTO);
            }
        }
    }
}
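For completeness, a minimal driver that runs the crawler could look like the following; only Crawler and crawlerByBSF() come from the code above, while the class name CrawlerMain is an assumption for illustration.

package com.xiaoshuo.service;

/**
 * Minimal driver for the crawler above.
 */
public class CrawlerMain {

    public static void main(String[] args) throws Exception {
        Crawler crawler = new Crawler();
        crawler.crawlerByBSF(); // BFS starting from the hard-coded seed URL
    }
}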
5: The underlying principle is much the same for any site, but a targeted crawler has to be tailored to its target. My goal is to pull this site's data straight into a database and then build an index on it, so each processed page is wrapped in an object (UrlTO for links, Chapter for content) that can be inserted into the database; a sketch of the indexing step follows.
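The post stops at "insert into the database and build an index", so the following is only a minimal sketch of the indexing half using Lucene (one of the blog's own categories). It assumes Chapter exposes getName()/getContent() getters matching the setters used in PaserUrlUtil; the field names, index path, and class name ChapterIndexer are illustrative assumptions, and the database insert is omitted.

package com.xiaoshuo.index;

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.xiaoshuo.to.Chapter;

/**
 * Sketch: feed one crawled Chapter into a Lucene index.
 */
public class ChapterIndexer {

    private IndexWriter writer;

    public ChapterIndexer(String indexPath) throws Exception {
        Directory dir = FSDirectory.open(Paths.get(indexPath));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        writer = new IndexWriter(dir, config);
    }

    public void index(Chapter chapter) throws Exception {
        Document doc = new Document();
        // store the title verbatim, analyze the body for full-text search
        doc.add(new StringField("name", chapter.getName(), Field.Store.YES));
        doc.add(new TextField("content", chapter.getContent(), Field.Store.YES));
        writer.addDocument(doc);
    }

    public void close() throws Exception {
        writer.close();
    }
}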
6: The HTML parsing could also be done with regular expressions, and all of the site-specific parsing methods could be rewritten as a single generic method that takes its expressions or selectors as parameters from a configuration file, so the same crawler can be reused for other sites; see the sketch below.
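As a rough illustration of that idea, here is a minimal sketch of a selector-driven extractor. It reuses PaserUrlUtil and UrlTO from above; the property-file format (selector.<depth> keys) and the class name ConfigurablePaser are my own assumptions, not part of the original post.

package com.xiaoshuo.util;

import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.xiaoshuo.to.UrlTO;

/**
 * Sketch: one generic extraction method in place of the hard-coded
 * getCategoryUrls/getDetailUrlList/getChapterList methods, driven by
 * selectors loaded from a properties file.
 */
public class ConfigurablePaser {

    private PaserUrlUtil paserUrlUtil = new PaserUrlUtil();
    private Properties selectors = new Properties();

    public ConfigurablePaser(String configFile) throws Exception {
        // e.g.  selector.1=.navlist li a    selector.5=.list dd a
        selectors.load(new FileInputStream(configFile));
    }

    /** Extracts the links for the next depth using the selector configured for it. */
    public List<UrlTO> extractLinks(String url, int currentDept) throws Exception {
        Document doc = paserUrlUtil.getDocument(url);
        String selector = selectors.getProperty("selector." + (currentDept + 1));
        List<UrlTO> result = new ArrayList<UrlTO>();
        if (selector == null) {
            return result; // deepest layer, nothing more to follow
        }
        for (Element element : doc.select(selector)) {
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(currentDept + 1);
            urlTO.setUrl(element.attr("href"));
            result.add(urlTO);
        }
        return result;
    }
}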