Sharing a Simple Java Crawler Framework
Writing different crawler logic for every website over and over gets tedious, so I implemented a small framework of my own.

The customizable parts are:

Request method (default: GET, with the User-Agent set to Chrome's), customizable by implementing the RequestSet interface
Storage method (default: files under the html folder on the F drive), customizable via the SaveUtil interface
Which resources to save (default: the entire HTML page)
Filtering (default: every URL is accepted), customizable by implementing the ResourseChooser interface to decide which URLs to follow and which pages to save
The parts already implemented are:

Downloading HTML pages, done with HttpClient
Parsing HTML pages, done with Jsoup
The HtmlDownloader class downloads an HTML page from a given URL:
package downloadpackage;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/*
 * Downloads an HTML page from a given URL.
 */
public class HtmlDownloader {

    RequestSet requestSet = null;

    public HtmlDownloader(RequestSet requestSet) {
        this.requestSet = requestSet;
    }

    public String downloadHtml(String url) {
        String html = null;
        // Create a client, then a reader that consumes the response entity
        BufferedReader reader = null;
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpResponse response = null;
        try {
            response = httpClient.execute(requestSet.getMethod(url));
            HttpEntity entity = response.getEntity();
            reader = new BufferedReader(new InputStreamReader(entity.getContent()));
            StringBuilder sb = new StringBuilder();
            while ((html = reader.readLine()) != null) {
                sb.append(html);
            }
            html = sb.toString();
            System.out.println("Fetched one HTML page successfully");
        } catch (IOException e) {
            System.out.println(url + " connection failed");
        } finally {
            // Close the reader and the client even when the request failed
            try {
                if (reader != null) {
                    reader.close();
                }
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return html;
    }
}
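The downloader can also be exercised on its own. Here is a minimal sketch, assuming the classes above are compiled together; DownloaderDemo is a hypothetical name, and the anonymous RequestSet issues a bare GET with no headers or timeouts:

package downloadpackage;

import org.apache.http.client.methods.HttpGet;

public class DownloaderDemo {
    public static void main(String[] args) {
        // Bare-bones request supplier: just a GET, no User-Agent or timeouts
        HtmlDownloader downloader = new HtmlDownloader(new RequestSet() {
            public HttpGet getMethod(String url) {
                return new HttpGet(url);
            }
        });
        String html = downloader.downloadHtml("http://www.bilibili.net");
        System.out.println(html == null ? "download failed" : "fetched " + html.length() + " chars");
    }
}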
The UrlGet class extracts all URL links from an HTML page:
package downloadpackage;

import java.util.LinkedList;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class UrlGet {

    public LinkedList<String> getUrls(String html) {
        LinkedList<String> urls = new LinkedList<String>();
        Document doc = Jsoup.parse(html);
        Elements links = doc.getElementsByTag("a");
        for (Element link : links) {
            String url = link.attr("href");
            urls.add(url);
        }
        return urls;
    }
}
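One caveat: attr("href") returns the raw attribute value, so relative links come back relative, and the framework later has to patch them up in process. Jsoup can resolve them itself when the parse call is given a base URI. A hedged variant one could add to UrlGet (my suggestion, not part of the original framework; it reuses the imports above):

    public LinkedList<String> getUrls(String html, String baseUri) {
        LinkedList<String> urls = new LinkedList<String>();
        // Passing the page's own URL as base URI lets Jsoup resolve relative links
        Document doc = Jsoup.parse(html, baseUri);
        for (Element link : doc.getElementsByTag("a")) {
            // "abs:href" yields an absolute URL, or "" if it cannot be resolved
            urls.add(link.attr("abs:href"));
        }
        return urls;
    }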
The resource-selection interface requires three methods: isNeed decides whether a URL is one we want; isResourse decides whether the URL's page is a resource page we want to save; and process fixes up URLs, since sometimes a link on a page is one we need but its format is wrong.
package choosepackage;

public interface ResourseChooser {
    public boolean isNeed(String url);
    public boolean isResourse(String url);
    public String process(String url);
}
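For example, a filter that stays on one host and only saves pages that look like HTML documents might be written as follows. This is a hypothetical implementation, not part of the original framework; the class name and host constant are illustrative:

package choosepackage;

public class BilibiliChooser implements ResourseChooser {

    private static final String HOST = "http://www.bilibili.net";

    public boolean isNeed(String url) {
        // Follow only site-relative links or absolute links on the same host
        return url.startsWith("/") || url.startsWith(HOST);
    }

    public boolean isResourse(String url) {
        // Save only pages that end in .html
        return url.endsWith(".html");
    }

    public String process(String url) {
        // Prefix site-relative links with the host
        return url.startsWith("/") ? HOST + url : url;
    }
}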
The RequestSet interface customizes the request method; implement getMethod to build the request:
package downloadpackage;

import org.apache.http.client.methods.HttpGet;

/*
 * An interface for supplying the HTTP request.
 * Implement getMethod to build the GET request for a URL.
 */
public interface RequestSet {
    public HttpGet getMethod(String url);
}

The SaveUtil interface customizes how data is stored; implement the save method:

package saveutil;

/*
 * Storage utility interface; the save method must be implemented.
 */
public interface SaveUtil {
    public void save(String url, String html);
}
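Both interfaces are small enough to implement inline. A sketch of two custom pieces (CustomPartsDemo and the variable names are illustrative; the Spider class below is what actually consumes them):

package spider;

import org.apache.http.client.methods.HttpGet;

import downloadpackage.RequestSet;
import saveutil.SaveUtil;

public class CustomPartsDemo {
    public static void main(String[] args) {
        // A "storage" that only prints, useful for dry runs
        SaveUtil dryRun = new SaveUtil() {
            public void save(String url, String html) {
                System.out.println("Would save " + url + " (" + html.length() + " chars)");
            }
        };
        // A GET request with a different User-Agent
        RequestSet mobileGet = new RequestSet() {
            public HttpGet getMethod(String url) {
                HttpGet get = new HttpGet(url);
                get.addHeader("User-Agent", "Mozilla/5.0 (iPhone; CPU iPhone OS 10_0 like Mac OS X)");
                return get;
            }
        };
        // Both plug into the Spider constructors shown below
    }
}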
The Spider class has five constructors, allowing various combinations of customization, and it contains the default implementations of the interfaces above:
package spider;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;

import choosepackage.ResourseChooser;
import downloadpackage.HtmlDownloader;
import downloadpackage.RequestSet;
import downloadpackage.UrlGet;
import saveutil.SaveUtil;

/*
 * Crawls and saves resources starting from a seed URL.
 */
public class Spider {

    public static void main(String[] args) {
        new Spider("http://www.bilibili.net").spiderStart();
    }

    // Seed URL
    String seed = null;
    // Storage strategy, user-implementable
    private SaveUtil saveUtil = null;
    // HTML page downloader
    private HtmlDownloader downloader = null;
    // URL extractor
    private UrlGet urlDownloader = null;
    // Resource filter
    private ResourseChooser resourseChooser = null;
    // Pages not yet downloaded
    LinkedList<String> unvisited = new LinkedList<String>();
    // Pages already downloaded
    HashSet<String> visited = new HashSet<String>();

    // Custom storage, request, and resource filter
    public Spider(SaveUtil saveUtil, RequestSet request, ResourseChooser resourseChooser, String seed) {
        this.saveUtil = saveUtil;
        this.downloader = new HtmlDownloader(request);
        this.urlDownloader = new UrlGet();
        this.resourseChooser = resourseChooser;
        this.seed = seed;
        unvisited.add(seed);
    }

    // Custom storage and resource filter
    public Spider(SaveUtil saveUtil, ResourseChooser resourseChooser, String seed) {
        this.resourseChooser = resourseChooser;
        this.downloader = new HtmlDownloader(new GetRequest());
        this.saveUtil = saveUtil;
        this.urlDownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // Custom storage and request
    public Spider(SaveUtil saveUtil, RequestSet requestSet, String seed) {
        this.saveUtil = saveUtil;
        this.downloader = new HtmlDownloader(requestSet);
        this.resourseChooser = new MyResourseChooser();
        this.urlDownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // Custom storage only
    public Spider(SaveUtil saveUtil, String seed) {
        this.saveUtil = saveUtil;
        this.downloader = new HtmlDownloader(new GetRequest());
        this.resourseChooser = new MyResourseChooser();
        this.urlDownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // All defaults
    public Spider(String seed) {
        this.saveUtil = new MySaveUtil();
        this.downloader = new HtmlDownloader(new GetRequest());
        this.resourseChooser = new MyResourseChooser();
        this.urlDownloader = new UrlGet();
        this.seed = seed;
        unvisited.add(seed);
    }

    // Main crawl loop
    private void spiderStart() {
        String html = null;
        while (!unvisited.isEmpty()) {
            String url = unvisited.poll();
            System.out.println("Fetching " + url);
            if (resourseChooser.isNeed(url)) {
                try {
                    html = downloader.downloadHtml(url);
                } catch (RuntimeException e) {
                    System.out.println(url + " could not be fetched");
                    continue;
                }
                visited.add(url);
                LinkedList<String> urls = new LinkedList<String>();
                try {
                    urls = urlDownloader.getUrls(html);
                } catch (RuntimeException e) {
                    System.out.println(url + " returned an empty HTML page");
                    continue;
                }
                Iterator<String> it = urls.iterator();
                while (it.hasNext()) {
                    String newUrl = it.next();
                    if (resourseChooser.isNeed(newUrl) && !visited.contains(newUrl) && !unvisited.contains(newUrl)) {
                        newUrl = resourseChooser.process(newUrl);
                        unvisited.add(newUrl);
                        System.out.println(newUrl + " queued");
                    }
                }
                System.out.println("Collected all URLs on " + url);
                if (resourseChooser.isResourse(url)) {
                    saveUtil.save(url, html);
                }
            }
        }
    }

    // Default resource filter
    private class MyResourseChooser implements ResourseChooser {

        @Override
        public boolean isNeed(String url) {
            // Only site-relative links and absolute http links are wanted
            if (!url.startsWith("/") && !url.startsWith("http")) {
                return false;
            }
            return true;
        }

        @Override
        public boolean isResourse(String url) {
            // By default, every crawled page counts as a resource
            return true;
        }

        @Override
        public String process(String url) {
            // Turn a site-relative link into an absolute one
            if (!url.startsWith("http")) {
                url = seed + url;
            }
            return url;
        }
    }

    // Default request: a GET with timeouts and a Chrome User-Agent
    public class GetRequest implements RequestSet {

        public HttpGet getMethod(String url) {
            // Build the GET request
            HttpGet getMethod = new HttpGet(url);
            // No proxy is configured here; e.g. new HttpHost("124.88.67.81", 80) would add one
            // Set request timeouts
            RequestConfig requestConfig = RequestConfig.custom()
                    .setConnectionRequestTimeout(10000)
                    .setConnectTimeout(10000)
                    .setSocketTimeout(10000)
                    .build();
            // Set the headers, mainly User-Agent
            getMethod.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
            // Apply the request parameters
            getMethod.setConfig(requestConfig);
            return getMethod;
        }
    }

    // Default storage: write each page to a .txt file under f:/html
    public class MySaveUtil implements SaveUtil {

        @Override
        public void save(String url, String html) {
            String fileName = getFileName(url);
            BufferedWriter writer = null;
            try {
                writer = new BufferedWriter(new FileWriter(fileName));
                writer.write(html);
                writer.flush();
                System.out.println("File written successfully");
            } catch (IOException e) {
                System.out.println("File write failed");
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }
                } catch (IOException e) {
                    System.out.println("Failed to close the stream");
                }
            }
        }

        // Derive a file name from the URL
        private String getFileName(String url) {
            String fileParentPath = "f:/html";
            File file = new File(fileParentPath);
            if (!file.exists()) {
                file.mkdir();
            }
            int last = url.lastIndexOf(".");
            int first = url.indexOf(".");
            url = url.substring(first, last);
            url = url.replaceAll("\\.", "");
            url = url.replaceAll("/", "");
            return fileParentPath + "/" + url + ".txt";
        }
    }
}
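To plug custom pieces in, pick the matching constructor. A usage sketch, assuming spiderStart is made public (in the code above it is private and only reachable from Spider's own main); dryRun, mobileGet, and BilibiliChooser refer to the earlier illustrative sketches:

new Spider("http://www.bilibili.net").spiderStart();                                // all defaults
new Spider(dryRun, "http://www.bilibili.net").spiderStart();                        // custom storage
new Spider(dryRun, mobileGet, "http://www.bilibili.net").spiderStart();             // + custom request
new Spider(dryRun, new BilibiliChooser(), "http://www.bilibili.net").spiderStart(); // + custom filter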
Summary
That's all for this article on a simple Java crawler framework; I hope it helps. If you have any questions, feel free to leave a comment and I'll reply as soon as I can. Thanks for your support!