A few pitfalls (WebMagic annotation mode + jsoup)
[b]Extracting each tr row of a table in annotation mode[/b]
In annotation mode, selecting each <tr> of a table hands jsoup an HTML fragment that starts with a bare <tr>. When jsoup re-parses such a fragment, the <tr> and <td> tags are dropped, so the fragment first has to be wrapped in a <table>:
[color=red][b]// wrap s in a <table> so jsoup keeps the tr/td tags
if (s.startsWith("<tr>")) {
    s = "<table>" + s + "</table>";
}[/b][/color]
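To see the pitfall in isolation, here is a minimal sketch using plain jsoup outside WebMagic (the sample fragment is made up):

import org.jsoup.Jsoup;

public class TrWrapDemo {
    public static void main(String[] args) {
        String fragment = "<tr><td>foo</td><td>bar</td></tr>";

        // Parsed as-is, jsoup's HTML tree builder discards the orphan
        // tr/td tags and keeps only the text nodes.
        System.out.println(Jsoup.parse(fragment).select("td").size()); // 0

        // Wrapped in <table>, the row structure survives.
        String wrapped = "<table>" + fragment + "</table>";
        System.out.println(Jsoup.parse(wrapped).select("td").size()); // 2
    }
}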
[b]The xpath [index] pattern[/b]
This pattern fails on table elements, e.g. //table[1].
So if the table has a div ancestor, index the div instead and descend from there, using the pattern
//div[@xx='xxxxx'][x]//table, as sketched below.
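A minimal sketch of the workaround, using WebMagic's Html selector directly (the div class and row content here are invented for illustration):

import us.codecraft.webmagic.selector.Html;

public class TableIndexDemo {
    public static void main(String[] args) {
        Html html = new Html("<div class='addProject'>"
                + "<table><tr><td>row</td></tr></table></div>");

        // Per the pitfall above, indexing the table itself is unreliable:
        System.out.println(html.xpath("//table[1]//td/text()").get());

        // Indexing an ancestor div and then descending to the table works:
        System.out.println(html
                .xpath("//div[@class='addProject'][1]//table//td/text()")
                .get());
    }
}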
package com.lsiding.robot.page.model;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.PageModelExtractorProxy;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractBy.Type;
/**
 * Licensed builder list page (basic information)
 * 
 * @author liyixing
 */
//@TargetUrl("http://59.52.254.106:8093/outQualificationQuery\\S+")
@ExtractBy(value = "//div[@class='addProject']/table//tr", multi = true)
public class BuilderBaseModel implements AfterExtractor {
    /**
     * Whether all pages of the paginated list have already been queued
     */
    public static boolean isAddTargetRequests = false;

    /**
     * Reset the flag so the paginated list pages are queued again. liyixing
     */
    public static void doIsAddTargetRequests() {
        isAddTargetRequests = false;
    }

    /**
     * Mark the queueing of the paginated list pages as finished. liyixing
     */
    public static void endIsAddTargetRequests() {
        isAddTargetRequests = true;
    }
    /**
     * Name
     */
    @ExtractBy(value = "//tr//td[2]//a/allText()", notNull = true, type = Type.XPath)
    private String name;

    /**
     * Detail link
     */
    @ExtractBy(value = "//tr//td[2]/a/@onclick", notNull = true, type = Type.XPath)
    private String detail;

    /**
     * Company
     */
    @ExtractBy(value = "//tr//td[3]/allText()", notNull = true, type = Type.XPath)
    private String company;

    /**
     * Registration number
     */
    @ExtractBy(value = "//tr//td[4]/allText()", notNull = true, type = Type.XPath)
    private String registerNumber;

    /**
     * Certificate number
     */
    @ExtractBy(value = "//tr//td[5]/allText()", notNull = true, type = Type.XPath)
    private String cerNumber;

    /**
     * Qualification certificate number
     */
    @ExtractBy(value = "//tr//td[6]/allText()", notNull = true, type = Type.XPath)
    private String qualCerNumber;

    /**
     * Status
     */
    @ExtractBy(value = "//tr//td[7]/allText()", notNull = true, type = Type.XPath)
    private String status;

    /**
     * Registered specialty
     */
    @ExtractBy(value = "//tr//td[8]/allText()", notNull = false, type = Type.XPath)
    private String major;

    /**
     * Registration validity period
     */
    @ExtractBy(value = "//tr//td[9]/allText()", notNull = false, type = Type.XPath)
    private String valTime;
    /**
     * Name
     */
    public String getName() {
        return name;
    }

    /**
     * Name
     */
    public void setName(String name) {
        this.name = name;
    }
    /**
     * Called after extraction of this model finishes
     */
    @Override
    public void afterProcess(Page page) {
        // Turn the onclick value, e.g. winopen('xxx',1000,600,'...');,
        // into an absolute detail URL
        detail = "http://59.52.254.106:8093/"
                + detail.replace("winopen('", "")
                        .replace("',1000,600,'人员证书轨迹查看');", "");
        // Compute the total page count from the total record count
        if (!isAddTargetRequests && StringUtils.isNotBlank(name)) {
            // Queue the remaining list pages according to the page count
            Integer allNumber = Integer
                    .valueOf(page
                            .getHtml()
                            .xpath("//div[@class='paging']//span[@class='localPage'][2]/tidyText()")
                            .get());
            List<String> urls = new ArrayList<String>();
            // How many pages in total, at 50 records per page
            int pageMaxSize = allNumber / 50;
            if (allNumber % 50 != 0) {
                pageMaxSize++;
            }
            for (int x = 2; x <= pageMaxSize; x++) {
                urls.add("http://59.52.254.106:8093/outQualificationQuery?pageSize=50&pageIndex="
                        + x + "&q_certStatus=0");
            }
            page.addTargetRequests(urls);
            endIsAddTargetRequests();
        }
    }
    public static void main(String[] args) throws IllegalAccessException {
        BuilderBaseModel.doIsAddTargetRequests();
        Spider ooSpider = OOSpider.create(
                Site.me()
                        .setRetryTimes(3)
                        .setSleepTime(100)
                        .setUserAgent(
                                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"),
                new ConsolePageModelPipeline(), BuilderBaseModel.class)
                // Start crawling from the first list page
                .addUrl("http://59.52.254.106:8093/outQualificationQuery?pageSize=50&pageIndex=1&q_certStatus=0")
                // Crawl with 5 threads
                .thread(5);
        // Swap in the proxy extractor (see below), then start the spider
        PageModelExtractorProxy.updatePageModelExtractor(ooSpider);
        ooSpider.run();
    }
}
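As a quick check on the pagination arithmetic in afterProcess: with, say, 1,234 total records at 50 per page, 1234 / 50 = 24 with remainder 34, so pageMaxSize becomes 25 and pages 2 through 25 are queued (page 1 is the seed URL in main).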
As noted above, jsoup drops the tr/td tags of a fragment that starts with a bare <tr>, so the custom PageModelExtractorProxy.process method wraps each extracted row before re-invoking the extractor:
public Object process(Page page) {
    // Only handle pages whose URL matches one of the target patterns
    boolean matched = false;
    for (Pattern targetPattern : targetUrlPatterns) {
        if (targetPattern.matcher(page.getUrl().toString()).matches()) {
            matched = true;
        }
    }
    if (!matched) {
        return null;
    }
    try {
        // processSingle is not accessible on PageModelExtractor, so
        // look it up and invoke it reflectively
        Method method = PageModelExtractor.class.getDeclaredMethod(
                "processSingle", Page.class, String.class, boolean.class);
        method.setAccessible(true);
        if (objectExtractor == null) {
            return method.invoke(pageModelExtractor, page, null, true);
        } else {
            if (objectExtractor.multi) {
                List<Object> os = new ArrayList<Object>();
                List<String> list = objectExtractor.getSelector()
                        .selectList(page.getRawText());
                for (String s : list) {
                    // Wrap s in a <table> so jsoup keeps the tr/td tags
                    if (s.startsWith("<tr>")) {
                        s = "<table>" + s + "</table>";
                    }
                    Object o = method.invoke(pageModelExtractor, page, s, false);
                    if (o != null) {
                        os.add(o);
                    }
                }
                return os;
            } else {
                String select = objectExtractor.getSelector().select(
                        page.getRawText());
                return method.invoke(pageModelExtractor, page, select, false);
            }
        }
    } catch (Exception e1) {
        throw new RuntimeException(e1);
    }
}
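The reflection lookup is what makes this proxy possible: processSingle is not public on WebMagic's PageModelExtractor, so the proxy fetches it with getDeclaredMethod, calls setAccessible(true), and re-invokes it on each <table>-wrapped row. The trade-off is fragility: if a later WebMagic release renames processSingle or changes its signature, the lookup throws and the RuntimeException surfaces at crawl time.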