使用WebMagic实现爬虫程序示例分享
package com.letv.cloud.spider;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class moviepaperpageprocessor implements pageprocessor {
private site page = site.me().setretrytimes(3).setsleeptime(1000);
public site getsite() {
return page;
}
public void process(page page) {
list<string> links = page.gethtml().links().regex(
"http://posters.aa.com/poster/\\d+").all();
links = removeduplicate(links);
page.addtargetrequests(links);
page.putfield("title", page.gethtml().xpath(
"//div[@id='imdbleftsecc']/center/h1/text()").tostring());
page.putfield("imgurl", page.gethtml().xpath(
"//div[@id='imdbleftsecc']/center/img/@src").tostring());
}
public static void main(string[] args) {
for (int i = 1; i <= 3; i++) {
spider.create(new moviepaperpageprocessor()).addurl(
"http://posters.aa.co/poster_page/" + i).thread(5).run();
}
}
public static list removeduplicate(list list) {
hashset hs = new hashset(list);
list.clear();
list.addall(hs);
return list;
}
}
上一篇: 一个开发人员眼中的JSP技术(下)