关于几个坑
程序员文章站
2022-07-14 08:31:33
...
注解模式抓取table中每行tr情况
如上,在 jsoup 解析中,以 &lt;tr&gt; 开头的纯 HTML 片段,其 tr 和 td 标签会被自动删除,因此需要在解析前手动给片段包上 table 标签:
// 给s追加table
if (s.startsWith("<tr>")) {
s = "<table>" + s + "</table>";
}
xpath的[index]模式
该模式会对table失效,比如//table[1]
因此如果table前面有div可以采用
//div[@xx='xxxxx'][x]//table的模式
package com.lsiding.robot.page.model;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.PageModelExtractorProxy;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.ExtractBy.Type;

/**
 * Page model for one row of the builder (construction engineer) list page
 * (basic information). Each instance is extracted from one {@code <tr>} of
 * the result table; pagination links are queued once in
 * {@link #afterProcess(Page)}.
 *
 * @author liyixing
 */
//@TargetUrl("http://59.52.254.106:8093/outQualificationQuery\\S+")
@ExtractBy(value = "//div[@class='addProject']/table//tr", multi = true)
public class BuilderBaseModel implements AfterExtractor {
	/** Records per result page, used to compute the total page count. */
	private static final int PAGE_SIZE = 50;

	/**
	 * Whether all paginated list URLs have already been queued.
	 * Declared volatile because it is read and written by the 5 crawler
	 * threads started in {@link #main(String[])}.
	 */
	public static volatile boolean isAddTargetRequests = false;

	/**
	 * Resets the flag so pagination URLs will be queued again. liyixing
	 */
	public static void doIsAddTargetRequests() {
		isAddTargetRequests = false;
	}

	/**
	 * Marks that all paginated list URLs have been queued. liyixing
	 */
	public static void endIsAddTargetRequests() {
		isAddTargetRequests = true;
	}

	/**
	 * Name (姓名).
	 */
	@ExtractBy(value = "//tr//td[2]//a/allText()", notNull = true, type = Type.XPath)
	private String name;
	/**
	 * Detail link (详情链接) — extracted from the onclick attribute and
	 * rewritten into a real URL in {@link #afterProcess(Page)}.
	 */
	@ExtractBy(value = "//tr//td[2]/a/@onclick", notNull = true, type = Type.XPath)
	private String detail;
	/**
	 * Company (所属公司).
	 */
	@ExtractBy(value = "//tr//td[3]/allText()", notNull = true, type = Type.XPath)
	private String company;
	/**
	 * Registration number (注册编号).
	 */
	@ExtractBy(value = "//tr//td[4]/allText()", notNull = true, type = Type.XPath)
	private String registerNumber;
	/**
	 * Certificate number (证书编号).
	 */
	@ExtractBy(value = "//tr//td[5]/allText()", notNull = true, type = Type.XPath)
	private String cerNumber;
	/**
	 * Qualification certificate number (资格证书编号).
	 */
	@ExtractBy(value = "//tr//td[6]/allText()", notNull = true, type = Type.XPath)
	private String qualCerNumber;
	/**
	 * Status (状态).
	 */
	@ExtractBy(value = "//tr//td[7]/allText()", notNull = true, type = Type.XPath)
	private String status;
	/**
	 * Registered major (注册专业).
	 */
	@ExtractBy(value = "//tr//td[8]/allText()", notNull = false, type = Type.XPath)
	private String major;
	/**
	 * Registration validity period (注册有效期).
	 */
	@ExtractBy(value = "//tr//td[9]/allText()", notNull = false, type = Type.XPath)
	private String valTime;

	/**
	 * Returns the name (姓名).
	 */
	public String getName() {
		return name;
	}

	/**
	 * Sets the name (姓名).
	 */
	public void setName(String name) {
		this.name = name;
	}

	/**
	 * Called after field extraction: rewrites the detail URL and, exactly
	 * once per crawl, queues the remaining list pages computed from the
	 * total record count shown in the paging widget.
	 */
	@Override
	public void afterProcess(Page page) {
		// The detail cell carries a javascript onclick like
		// winopen('...',1000,600,'人员证书轨迹查看'); strip the wrapper to
		// recover the relative URL, then prefix the host.
		detail = "http://59.52.254.106:8093/"
				+ detail.replace("winopen('", "")
						.replace("',1000,600,'人员证书轨迹查看');", "");

		// Queue every remaining list page, based on the total record count.
		if (!isAddTargetRequests && StringUtils.isNotBlank(name)) {
			int allNumber = Integer
					.parseInt(page
							.getHtml()
							.xpath("//div[@class='paging']//span[@class='localPage'][2]/tidyText()")
							.get());
			// Ceiling division: pages of PAGE_SIZE records each.
			int pageMaxSize = (allNumber + PAGE_SIZE - 1) / PAGE_SIZE;
			List<String> urls = new ArrayList<String>();
			// Page 1 is the seed URL, so start from page 2.
			for (int x = 2; x <= pageMaxSize; x++) {
				urls.add("http://59.52.254.106:8093/outQualificationQuery?pageSize=50&pageIndex="
						+ x + "&q_certStatus=0");
			}
			page.addTargetRequests(urls);
			endIsAddTargetRequests();
		}
	}

	public static void main(String[] args) throws IllegalAccessException {
		BuilderBaseModel.doIsAddTargetRequests();
		Spider ooSpider = OOSpider
				.create(Site.me()
						.setRetryTimes(3)
						.setSleepTime(100)
						.setUserAgent(
								"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"),
						new ConsolePageModelPipeline(), BuilderBaseModel.class)
				// Seed: first page of the qualification query list.
				.addUrl("http://59.52.254.106:8093/outQualificationQuery?pageSize=50&pageIndex=1&q_certStatus=0")
				// Crawl with 5 threads.
				.thread(5);
		// Install the proxy extractor (handles the <tr> wrapping), then run.
		PageModelExtractorProxy.updatePageModelExtractor(ooSpider);
		ooSpider.run();
	}
}
如前所述,jsoup 解析以 &lt;tr&gt; 开头的纯 HTML 片段时会删除 tr 和 td 标签,所以在下面的 process 方法中对片段追加了 table 包裹:
public Object process(Page page) { boolean matched = false; for (Pattern targetPattern : targetUrlPatterns) { if (targetPattern.matcher(page.getUrl().toString()).matches()) { matched = true; } } if (!matched) { return null; } try { Method method = PageModelExtractor.class.getDeclaredMethod( "processSingle", Page.class, String.class, boolean.class); method.setAccessible(true); if (objectExtractor == null) { return method.invoke(pageModelExtractor, page, null, true); } else { if (objectExtractor.multi) { List<Object> os = new ArrayList<Object>(); List<String> list = objectExtractor.getSelector() .selectList(page.getRawText()); for (String s : list) { Object o; // 给s追加table if (s.startsWith("<tr>")) { s = "<table>" + s + "</table>"; } o = method.invoke(pageModelExtractor, page, s, false); if (o != null) { os.add(o); } } return os; } else { String select = objectExtractor.getSelector().select( page.getRawText()); Object o; o = method.invoke(pageModelExtractor, page, select, false); return o; } } } catch (Exception e1) { throw new RuntimeException(e1); } }
// 给s追加table
if (s.startsWith("<tr>")) {
s = "<table>" + s + "</table>";
}
xpath的[index]模式
该模式会对table失效,比如//table[1]
因此如果table前面有div可以采用
//div[@xx='xxxxx'][x]//table的模式