欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

使用Heritrix爬取国内某最火的电子商城的笔记本信息遇到的问题

程序员文章站 2022-06-08 10:13:12
...
    网上的某些资料也说了,对Heritrix进行扩展定制即可个性化地从网上爬取自己需要的内容,为建立垂直搜索做好准备。
    这里主要扩展FrontierScheduler和Extractor两个class。由于是个demo,所以比较简单;又由于这些代码仅仅是周日下午和今天晚上抽时间写的,故效率上没有过多考虑,比如使用了太多的正则、反射得到的方法构造后应该缓存等,仅供参考。

    扩展如下:

    class FrontierSchedulerFor360Buy extends FrontierScheduler

    //至于为什么要扩展schedule大家应该都知道,CandidateURI表示待选的链接,那么意思就是对将要进入候选的链接进行业务上的过滤

    注意:这里有个问题,就是如果需要过滤符合某个格式的链接,但是这个格式的链接不是全部过滤,而且也找不到一定的业务规则去过滤,那么就不能在这里过滤了,后面在Extractor中讲解


	/**
	 * Forwards a candidate URI to the frontier once for every admission rule it satisfies.
	 * <p>
	 * The rules, checked in order, are: the DNS record of www.360buy.com, the site's
	 * robots.txt, a URL containing {@code PATTERN_360_NOTEBOOK}, and a URL matching
	 * {@code PATTERN_360_NOTEBOOK_BRAND} that does not contain {@code "670-671-672-0"}.
	 * URIs that satisfy no rule are not forwarded. Every URI seen is echoed to stderr.
	 *
	 * @param caUri the candidate URI being considered for scheduling
	 */
	protected void schedule(CandidateURI caUri) {
		final String target = caUri.toString();
		System.err.println(">>>++++++ " + target);

		// NOTE(review): PATTERN_360_NOTEBOOK is used as a literal substring here, while
		// extract() feeds the same constant to String.matches() — confirm which is intended.
		final boolean[] admissions = {
			target.contains("dns:www.360buy.com"),
			target.contains("http://www.360buy.com/robots.txt"),
			target.contains(PATTERN_360_NOTEBOOK),
			// Brand listing pages, except the ones carrying brand number 0.
			target.matches(this.PATTERN_360_NOTEBOOK_BRAND) && !target.contains("670-671-672-0")
		};

		// One frontier call per satisfied rule, in rule order.
		for (int rule = 0; rule < admissions.length; rule++) {
			if (!admissions[rule]) {
				continue;
			}
			System.err.println(">>>>>>++++++ " + target);
			this.getController().getFrontier().schedule(caUri);
		}
	}


     
    class ExtractorFor360Buy extends Extractor
    这里扩展的话基本是对crawlUri的解析,以及将其中一部分符合业务规则的链接加入到候选链接中(一旦加入就会被过滤,这就遇到了上面“注意”部分说到的问题)。这样我们需要直接将链接交由 BdbFrontier 处理;根据源码查看,在schedule候选链接CandidateURI的时候会调用getPathFromSeed(),所以需要通过反射设置pathFromSeed。


以下为具体的异常:

java.lang.NullPointerException
	at org.archive.crawler.datamodel.CandidateURI.getTransHops(CandidateURI.java:382)
	at org.archive.crawler.frontier.AbstractFrontier.applySpecialHandling(AbstractFrontier.java:727)
	at org.archive.crawler.frontier.WorkQueueFrontier.receive(WorkQueueFrontier.java:442)
	at org.archive.crawler.util.SetBasedUriUniqFilter.add(SetBasedUriUniqFilter.java:90)
	at org.archive.crawler.frontier.WorkQueueFrontier.schedule(WorkQueueFrontier.java:427)
	at com.awen.heritrix.ExtractorFor360Buy.extract(ExtractorFor360Buy.java:124)
	at org.archive.crawler.extractor.Extractor.innerProcess(Extractor.java:67)
	at org.archive.crawler.framework.Processor.process(Processor.java:109)
	at org.archive.crawler.framework.ToeThread.processCrawlUri(ToeThread.java:306)
	at org.archive.crawler.framework.ToeThread.run(ToeThread.java:154)


/**
 * Extracts product data and follow-up links from a fetched page.
 * <p>
 * If the URI matches {@code PATTERN_360_NOTEBOOK_BRAND_ABSOLUTE}, each product entry on the
 * page is parsed, its detail and image links are handed to the frontier, and one text file
 * per product is written. In every case the recorded response is then scanned for anchors
 * that match the notebook or notebook-brand patterns, and those are added as outlinks.
 *
 * @param curi the URI whose fetched content is processed
 */
protected void extract(CrawlURI curi) {
		String currentUrl = curi.toString();
		if (currentUrl.matches(this.PATTERN_360_NOTEBOOK_BRAND_ABSOLUTE)) {
			extractBrandPageProducts(curi, currentUrl);
		}
		// Brand pages are scanned for links as well, exactly like any other page.
		extractNotebookLinks(curi);
	}

	/**
	 * Parses every product entry of a brand listing page and records it.
	 * A failure while handling one product is printed and that product is skipped.
	 *
	 * @param curi       the brand page being processed
	 * @param currentUrl string form of {@code curi}
	 */
	private void extractBrandPageProducts(CrawlURI curi, String currentUrl) {
		// A product entry is a node whose grandchild is <span class="reputation">.
		NodeFilter reputationFilter = new AndFilter(new TagNameFilter("span"),
				new HasAttributeFilter("class", "reputation"));
		NodeFilter productFilter = new HasChildFilter(new HasChildFilter(reputationFilter));
		NodeFilter priceFilter = new AndFilter(new TagNameFilter("div"),
				new HasAttributeFilter("class", "p-price"));
		NodeFilter nameFilter = new AndFilter(new TagNameFilter("div"),
				new HasAttributeFilter("class", "p-name"));

		// Loop-invariant: compile once per page instead of once per product.
		Pattern detailPattern = Pattern.compile(this.PATTERN_360_PRODUCT, Pattern.CASE_INSENSITIVE);
		Pattern imagePattern = Pattern.compile(this.PATTERN_360_PRODUCT_IMG, Pattern.CASE_INSENSITIVE);
		Pattern reviewPattern = Pattern.compile(this.PATTERN_360_PRODUCT_EVALUATE, Pattern.CASE_INSENSITIVE);

		try {
			// NOTE(review): `parser` is a field shared by this processor and setURL()
			// presumably re-fetches the page — confirm thread-safety and whether the
			// recorded response could be reused instead.
			parser.setURL(currentUrl);

			// Brand number: the segment following "670-671-672-" up to the next '-'.
			String brandMarker = "670-671-672-";
			String brandNo = currentUrl.substring(
					currentUrl.indexOf(brandMarker) + brandMarker.length(),
					currentUrl.indexOf("-", "http://www.360buy.com/products/670-671-672-".length()));

			NodeList products = parser.parse(productFilter);
			for (int i = 0; i < products.size(); ++i) {
				try {
					String content = products.elementAt(i).toHtml();

					String detail = "";
					String img = "";
					String review = "";

					// Product detail link.
					Matcher detailMatcher = detailPattern.matcher(content);
					if (detailMatcher.find()) {
						String detailUrl = detailMatcher.group();
						scheduleBypassingFilter(detailUrl);
						detail = detailUrl.substring(detailUrl.lastIndexOf("/") + 1, detailUrl.indexOf(".html"));
						System.err.println(detail);
					}

					// Product image link.
					Matcher imageMatcher = imagePattern.matcher(content);
					if (imageMatcher.find()) {
						String imageUrl = imageMatcher.group();
						this.addLindFromString(curi, imageUrl, "", Link.EMBED_HOP);
						img = imageUrl.substring(imageUrl.lastIndexOf("/") + 1, imageUrl.indexOf(".jpg"));
						System.err.println(img);
						// Previously scheduled without a pathFromSeed, which makes the
						// frontier throw NullPointerException in getTransHops().
						scheduleBypassingFilter(imageUrl);
					}

					// Product review link (recorded only, not crawled).
					Matcher reviewMatcher = reviewPattern.matcher(content);
					if (reviewMatcher.find()) {
						review = reviewMatcher.group();
					}

					String price = extractPlainText(content, priceFilter);
					System.err.println("price = " + price);

					String remark = extractPlainText(content, nameFilter);
					System.err.println("name = " + remark);

					writeProductRecord(brandNo, detail, img, price, review, remark);
				} catch (ParserException e) {
					e.printStackTrace();
				} catch (URIException e) {
					e.printStackTrace();
				} catch (SecurityException e) {
					e.printStackTrace();
				} catch (NoSuchMethodException e) {
					e.printStackTrace();
				} catch (IllegalArgumentException e) {
					e.printStackTrace();
				} catch (IllegalAccessException e) {
					e.printStackTrace();
				} catch (InvocationTargetException e) {
					e.printStackTrace();
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Hands a URL straight to the frontier instead of adding it as an outlink.
	 * <p>
	 * The frontier reads the candidate's pathFromSeed while scheduling; its setter is not
	 * accessible from here, so it is invoked reflectively before the URI is scheduled.
	 *
	 * @param url absolute URL to schedule
	 * @throws URIException if {@code url} cannot be turned into a UURI
	 * @throws NoSuchMethodException if {@code setPathFromSeed(String)} is not declared
	 * @throws IllegalAccessException if the setter cannot be invoked
	 * @throws InvocationTargetException if the setter itself throws
	 */
	private void scheduleBypassingFilter(String url) throws URIException, NoSuchMethodException,
			IllegalAccessException, InvocationTargetException {
		CandidateURI candidate = new CandidateURI(UURIFactory.getInstance(url));
		Method setter = CandidateURI.class.getDeclaredMethod("setPathFromSeed", String.class);
		setter.setAccessible(true);
		// NOTE(review): the URL itself is stored as pathFromSeed, as the original code did;
		// presumably a hop path derived from the parent URI is what belongs here — confirm.
		setter.invoke(candidate, url);
		setter.setAccessible(false);
		this.getController().getFrontier().schedule(candidate);
	}

	/**
	 * Returns the text of the first node in {@code html} accepted by {@code filter}.
	 *
	 * @param html   markup of a single product entry
	 * @param filter selects the node whose text is wanted
	 * @return the extracted text, or an empty string when no node is accepted
	 * @throws ParserException if the markup cannot be parsed
	 */
	private String extractPlainText(String html, NodeFilter filter) throws ParserException {
		Parser fragmentParser = new Parser();
		fragmentParser.setInputHTML(html);
		NodeList accepted = fragmentParser.parse(filter);
		if (accepted.size() == 0) {
			// Entry lacks this element; avoid dereferencing a missing node.
			return "";
		}
		TextExtractingVisitor visitor = new TextExtractingVisitor();
		new Parser(accepted.elementAt(0).toHtml()).visitAllNodesWith(visitor);
		return visitor.getExtractedText();
	}

	/**
	 * Writes one product record to {@code /home/awen/360/<brandNo>_<detail>.txt}, one field
	 * per line. I/O failures are printed and otherwise ignored.
	 *
	 * @param brandNo brand number
	 * @param detail  detail-link suffix
	 * @param img     image-link suffix
	 * @param price   price text
	 * @param review  review link
	 * @param remark  product name / description text
	 */
	private void writeProductRecord(String brandNo, String detail, String img, String price,
			String review, String remark) {
		File file = new File("/home/awen/360/" + brandNo + "_" + detail + ".txt");
		BufferedWriter bw = null;
		try {
			// FileWriter creates the file when it does not exist yet.
			bw = new BufferedWriter(new FileWriter(file));
			bw.write(brandNo + ls);
			bw.write(detail + ls);
			bw.write(img + ls);
			bw.write(price + ls);
			bw.write(review + ls);
			bw.write(remark + ls);
			bw.flush();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close even when a write failed so the file handle is not leaked.
			if (bw != null) {
				try {
					bw.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Scans the recorded response for anchors and adds those matching the notebook or
	 * notebook-brand patterns as navigation outlinks of {@code curi}.
	 *
	 * @param curi the URI whose recorded response is scanned
	 */
	private void extractNotebookLinks(CrawlURI curi) {
		HttpRecorder recorder = curi.getHttpRecorder();
		if (recorder == null) {
			// Nothing was recorded for this URI.
			return;
		}

		ReplayCharSequence rcs = null;
		try {
			rcs = recorder.getReplayCharSequence();
		} catch (IOException e) {
			e.printStackTrace();
		}
		if (rcs == null) {
			return;
		}

		try {
			String content = rcs.toString();
			Matcher matcher = Pattern.compile(this.PATTERN_A_HREF, Pattern.CASE_INSENSITIVE).matcher(content);

			while (matcher.find()) {
				String href = matcher.group(2);
				if (href == null) {
					// Group 2 did not take part in this match.
					continue;
				}

				// Both checks look at the raw href; the rewritten URL is never re-tested.
				if (href.matches(this.PATTERN_360_NOTEBOOK)) {
					System.err.println("————————————————————————————————符合笔记本——————————————————————————");
					this.addLindFromString(curi, "http://www.360buy.com/products" + href, "", Link.NAVLINK_HOP);
				}

				if (href.matches(this.PATTERN_360_NOTEBOOK_BRAND)) {
					System.err.println("————————————————————————————————符合笔记本品牌——————————————————————————");
					this.addLindFromString(curi, "http://www.360buy.com/products/" + href, "", Link.NAVLINK_HOP);
				}
			}
		} finally {
			// Release the replay buffer once scanning is done.
			try {
				rcs.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
	
	/**
	 * Adds {@code uri} to {@code curi} as an outlink, resolved relative to the page's base.
	 * A {@code URIException} raised while creating the link is printed and swallowed, so the
	 * link is dropped.
	 * <p>
	 * Author's note: when the frontier scheduler would also filter links of this shape but
	 * cannot tell which ones to drop, and the links must still be fetched, they can instead
	 * be handed directly to the BdbFrontier.
	 *
	 * @param curi    page the link was found on
	 * @param uri     link target, absolute or relative
	 * @param context context string stored with the link
	 * @param hopType hop-type character for the link
	 */
	private void addLindFromString(CrawlURI curi, String uri, CharSequence context, char hopType){
		try {
			curi.createAndAddLinkRelativeToBase(uri, context, hopType);
		} catch (URIException malformedLink) {
			malformedLink.printStackTrace();
		}
	}




  • 使用Heritrix爬取国内某最火的电子商城的笔记本信息遇到的问题
            
    
    博客分类: 搜索引擎 360HTML 
  • 大小: 19.1 KB
相关标签: 360 HTML