How to implement a Java crawler with springboot+webmagic and persist data via JDBC and MySQL
A while ago I needed to scrape some information from web pages. I had no prior experience with crawlers, so I looked into WebMagic and wrote a simple one.
1. A brief introduction to WebMagic:
WebMagic has a fully modular design that covers the whole crawler life cycle (link extraction, page downloading, content extraction, and persistence). It supports multi-threaded and distributed crawling, as well as automatic retries and custom UA/cookie settings.
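For example, retries, the crawl delay, and a custom UA/cookie are all configured on WebMagic's Site object. A minimal sketch (the User-Agent and cookie values below are placeholders, not taken from the original article):

import us.codecraft.webmagic.Site;

public class SiteConfigExample {
    // Site holds the crawl configuration: charset, retries, politeness delay, headers/cookies.
    private Site site = Site.me()
            .setCharset("utf-8")                               // page encoding
            .setRetryTimes(3)                                  // retry failed downloads up to 3 times
            .setSleepTime(1000)                                // wait 1 second between requests
            .setUserAgent("Mozilla/5.0 (placeholder UA)")      // custom User-Agent (placeholder)
            .addCookie("uuid", "placeholder-cookie-value");    // custom cookie (placeholder)
}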
Design approach: WebMagic is organized around four components - Downloader, PageProcessor, Scheduler, and Pipeline - covering page downloading, content extraction, URL scheduling, and persistence respectively.
Maven dependencies:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
</dependency>
The JDBC approach:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;

public class CsdnBlogDao {

    private Connection conn = null;
    private Statement stmt = null;

    public CsdnBlogDao() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            String url = "jdbc:mysql://localhost:3306/test?"
                    + "user=***&password=***&useUnicode=true&characterEncoding=utf8";
            conn = DriverManager.getConnection(url);
            stmt = conn.createStatement();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    public int add(CsdnBlog csdnBlog) {
        try {
            String sql = "insert into `test`.`csdnblog` (`keyes`, `titles`, `content`, `dates`, `tags`, `category`, `views`, `comments`, `copyright`) values (?, ?, ?, ?, ?, ?, ?, ?, ?);";
            PreparedStatement ps = conn.prepareStatement(sql);
            ps.setInt(1, csdnBlog.getKey());
            ps.setString(2, csdnBlog.getTitle());
            ps.setString(3, csdnBlog.getContent());
            ps.setString(4, csdnBlog.getDates());
            ps.setString(5, csdnBlog.getTags());
            ps.setString(6, csdnBlog.getCategory());
            ps.setInt(7, csdnBlog.getView());
            ps.setInt(8, csdnBlog.getComments());
            ps.setInt(9, csdnBlog.getCopyright());
            return ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return -1;
    }
}
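The DAO assumes that the `test`.`csdnblog` table already exists. If it does not, here is a minimal one-off sketch for creating it over the same JDBC URL (the column types are assumptions inferred from the entity class shown below, not taken from the original article):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

// Hypothetical one-time setup: creates the table that the DAO's INSERT expects.
public class CreateCsdnBlogTable {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        String url = "jdbc:mysql://localhost:3306/test?"
                + "user=***&password=***&useUnicode=true&characterEncoding=utf8";
        try (Connection conn = DriverManager.getConnection(url);
             Statement stmt = conn.createStatement()) {
            // Column types below are assumptions; adjust them to the real data as needed.
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS `csdnblog` ("
                    + "`keyes` INT, `titles` VARCHAR(255), `content` TEXT,"
                    + "`dates` VARCHAR(64), `tags` VARCHAR(255), `category` VARCHAR(255),"
                    + "`views` INT, `comments` INT, `copyright` INT"
                    + ") DEFAULT CHARSET=utf8");
        }
    }
}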
Entity class:
public class CsdnBlog {

    private int key;          // post ID
    private String title;     // title
    private String dates;     // publish date
    private String tags;      // tags
    private String category;  // categories
    private int view;         // view count
    private int comments;     // comment count
    private int copyright;    // 1 if original, 0 otherwise
    private String content;   // article body

    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }

    public int getKey() { return key; }
    public void setKey(int key) { this.key = key; }

    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }

    public String getDates() { return dates; }
    public void setDates(String dates) { this.dates = dates; }

    public String getTags() { return tags; }
    public void setTags(String tags) { this.tags = tags; }

    public String getCategory() { return category; }
    public void setCategory(String category) { this.category = category; }

    public int getView() { return view; }
    public void setView(int view) { this.view = view; }

    public int getComments() { return comments; }
    public void setComments(int comments) { this.comments = comments; }

    public int getCopyright() { return copyright; }
    public void setCopyright(int copyright) { this.copyright = copyright; }

    @Override
    public String toString() {
        return "CsdnBlog [key=" + key + ", title=" + title + ", content=" + content
                + ", dates=" + dates + ", tags=" + tags + ", category=" + category
                + ", view=" + view + ", comments=" + comments + ", copyright=" + copyright + "]";
    }
}
Crawler launcher class:
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class CsdnBlogPageProcessor implements PageProcessor {

    private static String username = "chenyufeng1991"; // CSDN username to crawl
    private static int size = 0;                       // number of articles crawled so far

    // crawl configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    // process() is the core extraction hook; all scraping logic goes here
    public void process(Page page) {
        // list page
        if (!page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/\\d+").match()) {
            // add all article pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='article_list']").links() // restrict extraction to the article list area
                    .regex("/" + username + "/article/details/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // turn relative URLs into absolute ones
                    .all());
            // add the other list pages
            page.addTargetRequests(page.getHtml().xpath("//div[@id='papelist']").links() // restrict extraction to the pagination area
                    .regex("/" + username + "/article/list/\\d+")
                    .replace("/" + username + "/", "http://blog.csdn.net/" + username + "/") // turn relative URLs into absolute ones
                    .all());
        // article page
        } else {
            size++; // one more article
            // store the extracted data in a CsdnBlog object so it can be written to the database
            CsdnBlog csdnBlog = new CsdnBlog();
            // ID
            csdnBlog.setKey(Integer.parseInt(
                    page.getUrl().regex("http://blog\\.csdn\\.net/" + username + "/article/details/(\\d+)").get()));
            // title
            csdnBlog.setTitle(
                    page.getHtml().xpath("//div[@class='article_title']//span[@class='link_title']/a/text()").get());
            // content
            csdnBlog.setContent(
                    page.getHtml().xpath("//div[@class='article_content']/allText()").get());
            // date
            csdnBlog.setDates(
                    page.getHtml().xpath("//div[@class='article_r']/span[@class='link_postdate']/text()").get());
            // tags (there can be several, joined with commas)
            csdnBlog.setTags(listToString(
                    page.getHtml().xpath("//div[@class='article_l']/span[@class='link_categories']/a/allText()").all()));
            // categories (there can be several, joined with commas)
            csdnBlog.setCategory(listToString(
                    page.getHtml().xpath("//div[@class='category_r']/label/span/text()").all()));
            // view count
            csdnBlog.setView(Integer.parseInt(page.getHtml().xpath("//div[@class='article_r']/span[@class='link_view']")
                    .regex("(\\d+)人阅读").get()));
            // comment count
            csdnBlog.setComments(Integer.parseInt(page.getHtml()
                    .xpath("//div[@class='article_r']/span[@class='link_comments']").regex("\\((\\d+)\\)").get()));
            // original post or not
            csdnBlog.setCopyright(page.getHtml().regex("bog_copyright").match() ? 1 : 0);
            // save the object to the database
            new CsdnBlogDao().add(csdnBlog);
            // print the object to the console
            System.out.println(csdnBlog);
        }
    }

    // join a List<String> into a comma-separated String
    public static String listToString(List<String> stringList) {
        if (stringList == null) {
            return null;
        }
        StringBuilder result = new StringBuilder();
        boolean flag = false;
        for (String string : stringList) {
            if (flag) {
                result.append(",");
            } else {
                flag = true;
            }
            result.append(string);
        }
        return result.toString();
    }

    public static void main(String[] args) {
        long startTime, endTime;
        System.out.println("[Crawler started]...");
        startTime = System.currentTimeMillis();
        // start from the user's blog home page with 5 threads
        Spider.create(new CsdnBlogPageProcessor()).addUrl("http://blog.csdn.net/" + username).thread(5).run();
        endTime = System.currentTimeMillis();
        System.out.println("[Crawler finished] crawled " + size + " articles in about "
                + ((endTime - startTime) / 1000) + " seconds; results have been saved to the database.");
    }
}
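As written, the processor persists each post by constructing the DAO directly inside process(). A more idiomatic WebMagic setup puts the extracted object into the page's result items and lets a Pipeline handle persistence. A minimal sketch of that alternative (CsdnBlogPipeline is a hypothetical class, not part of the original code):

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

// Hypothetical pipeline that reuses the JDBC DAO shown earlier.
public class CsdnBlogPipeline implements Pipeline {

    private final CsdnBlogDao dao = new CsdnBlogDao(); // one DAO (and connection) per pipeline

    @Override
    public void process(ResultItems resultItems, Task task) {
        CsdnBlog blog = resultItems.get("blog"); // field put by the PageProcessor
        if (blog != null) {
            dao.add(blog);
        }
    }
}

With this in place, process() would call page.putField("blog", csdnBlog) instead of new CsdnBlogDao().add(csdnBlog), and the spider would be started with Spider.create(new CsdnBlogPageProcessor()).addPipeline(new CsdnBlogPipeline()).addUrl("http://blog.csdn.net/" + username).thread(5).run().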
The MySQL approach (using Spring Boot services and MyBatis):
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ConfigurableApplicationContext;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

@SpringBootApplication // needed so SpringApplication.run can bootstrap this class
public class GamePageProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(GamePageProcessor.class);

    private static DianjingService d;
    private static BannerService bs;
    private static SportService ss;
    private static YuleNewsService ys;
    private static UpdateService ud;

    // crawl configuration: encoding, crawl interval, retry count, etc.
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        // boot the Spring context first so the services can be fetched from it
        ConfigurableApplicationContext context = SpringApplication.run(GamePageProcessor.class, args);
        d = context.getBean(DianjingService.class);
        // Spider.create(new GamePageProcessor()).addUrl("网址").thread(5).run();
    }

    // process() is the core extraction hook; all scraping logic goes here
    public void process(Page page) {
        Selectable url = page.getUrl();
        if (url.toString().equals("网址")) { // "网址" is the placeholder the original uses for the target URL
            // titles
            List<String> ls = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-title']/a/text()").all();
            // hrefs: the href attribute of the <a> tags
            List<String> ls1 = page.getHtml().xpath("//div[@class='v']/div[@class='v-link']/a/@href").all();
            // dates
            List<String> ls2 = page.getHtml().xpath("//div[@class='v']/div[@class='v-meta va']/div[@class='v-meta-entry']/div[@class='v-meta-data']/span[@class='r']/text()").all();
            // photos
            List<String> ls3 = page.getHtml().xpath("//div[@class='v']/div[@class='v-thumb']/img/@src").all();
            for (int i = 0; i < 5; i++) {
                DianjingVideo dv = new DianjingVideo(); // one fresh entity per extracted item
                dv.setTitles(ls.get(i));
                dv.setCategory("");
                dv.setDates(ls2.get(i));
                dv.setHrefs(ls1.get(i));
                dv.setPhoto(ls3.get(i));
                dv.setSources("");
                d.addVideo(dv);
            }
        }
    }
}
Controller:
@Controller
@RequestMapping(value = "/dianjing")
public class DianjingController {

    @Autowired
    private DianjingService s;

    /* mobile games */
    @RequestMapping("/dianjing")
    @ResponseBody
    public Object dianjing() {
        List<Dianjing> list = s.find2();
        JSONObject jo = new JSONObject();
        if (list != null) {
            jo.put("code", 0);
            jo.put("success", true);
            jo.put("count", list.size());
            jo.put("list", list);
        }
        return jo;
    }
}
The entity class is not shown here.
DAO layer:
@Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
int addDj(Dianjing dj);
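For context, the @Insert method above would normally sit in a MyBatis mapper interface. A minimal sketch, assuming a hypothetical DianjingMapper interface and a find2() query to match the controller's s.find2() call (the SELECT statement is an assumption, not taken from the original article):

import java.util.List;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Select;

// Hypothetical mapper interface; only the @Insert method appears in the original article.
@Mapper
public interface DianjingMapper {

    @Insert("insert into dianjing (titles,dates,category,hrefs,photo,sources) "
            + "values(#{titles},#{dates},#{category},#{hrefs},#{photo},#{sources})")
    int addDj(Dianjing dj);

    // Assumed counterpart of the controller's s.find2() call.
    @Select("select * from dianjing")
    List<Dianjing> find2();
}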
That concludes this walkthrough of implementing a Java crawler with springboot+webmagic and persisting data via JDBC and MySQL. I hope it serves as a useful reference.