MiseringThread.java 解析页面线程
程序员文章站
2022-06-10 18:46:02
...
http://injavawetrust.iteye.com
package com.iteye.injavawetrust.miner;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* 解析页面线程
* @author InJavaWeTrust
*
*/
public class MiseringThread extends Thread {
private static final Log LOG = LogFactory.getLog(MiseringThread.class);
private MinerConfig config = null;
public MiseringThread(MinerConfig config) {
this.config = config;
}
@Override
public void run() {
while (!MinerMonitorThread.done) {
misering();
}
}
private synchronized void misering() {
Html html = MinerQueue.waitingMiseringPoll(); // 等待提取URL的分析页面出队列
if (null == html || MinerUtil.isBlank(html.getHtml())) {
return;
}
//当前页面深度<爬取深度 取出当前页面全部URL
if (html.getDepth() < config.getMaxDepth()) {
LOG.info("MiseringThread获取页面[" + html.getUrl() + "]下所有URL。。。。。。 当前线程 [" + Thread.currentThread().getName() + "]");
Set<String> urls = MinerUtil.getAllUrl(html.getUrl());
for(String url : urls){
if(null == url || url.equals("")){
continue;
}
if(url.substring(url.length() - 1, url.length()).equals("/")){
url = url.substring(0, url.length() - 1);
}
MinerUrl minerUrl = new MinerUrl();
minerUrl.setUrl(url);
minerUrl.setDepth(html.getDepth() + 1); // 爬取深度+1
// 判断URL列表是否包含关键字
if(!MinerUtil.checkKeys(url, config.getKeys())){
continue;
}
// 添加到待访问队列,每个URL只访问一次
MinerQueue.addUnVisited(minerUrl);
// 将页面URL 添加到URL队列 保证每个URL只访问一次
MinerQueue.addUrlSet(minerUrl.getUrl());
}
}
}
}