欢迎您访问程序员文章站，本站旨在为大家提供和分享程序员计算机编程知识！
您现在的位置是: 首页

Jsoup爬虫 demo

程序员文章站 2022-04-01 23:13:38
...
pom.xml文件添加下面的内容
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>webContent</groupId>
  <artifactId>com.xly.webContent</artifactId>
  <version>0.0.1-SNAPSHOT</version>
      <repositories>  
        <repository>  
            <id>com.springsource.repository.bundles.release</id>  
            <name>EBR Spring Release Repository</name>  
            <url>http://repository.springsource.com/maven/bundles/release</url>  
        </repository>  
        <repository>  
            <id>com.springsource.repository.bundles.external</id>  
            <name>EBR External Release Repository</name>  
            <url>http://repository.springsource.com/maven/bundles/external</url>  
        </repository>  
    </repositories> 
       <properties>  
        <org.springframework.version>3.0.5.RELEASE</org.springframework.version>  
    </properties>
	<dependencies>
		<dependency>
			<!-- jsoup HTML parser library @ http://jsoup.org/ -->
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.5.2</version>
		</dependency>
		<dependency>
			<groupId>org.springframework</groupId>
			<artifactId>spring-core</artifactId>
			<version>${org.springframework.version}</version>
		</dependency>
		<dependency>
			<groupId>org.slf4j</groupId>
			<artifactId>slf4j-api</artifactId>
			<version>1.6.1</version>
		</dependency>

	</dependencies>
</project>


处理逻辑
package com.xly.jsoup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.xly.jsoup.bean.WebInfoBean;
/**
 * Demo crawler for Sina Finance stock news, built on jsoup.
 *
 * <p>{@link #main(String[])} loads the news list page of one stock, collects every link found
 * inside the {@code datelist} div, downloads the text of each linked article and prints the
 * resulting {@link WebInfoBean}s to standard output.
 *
 * @author Kaikai
 * @version $Id: WebContentMain.java, v 0.1 2014-10-26 9:49:32 AM Kaikai Exp $
 */
public class WebContentMain {

    /** Empty placeholder that nothing in this class reads; kept so existing references compile. */
    public static final String BASE_URL="";

    public static final Logger log = LoggerFactory.getLogger(WebContentMain.class);

    /** Sina Finance home page; not read by this class. */
    static String base_url = "http://finance.sina.com.cn/";
    /** Prefix of the per-stock news list page; a page name such as {@code sh600158.phtml} is appended. */
    static String base_info_url="http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/";
    /** {@code id} of the div that is searched for article paragraphs on an article page. */
    static String sub_div_name="artibody";

    /**
     * Crawls the news list of stock {@code sh600158} and prints every article found.
     *
     * @param args ignored
     * @throws Exception if the news list page cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        getDatelistBean(base_info_url + "sh600158.phtml", "datelist");
    }

    /**
     * Downloads the list page at {@code url}, turns every link inside
     * {@code <div class="divname">} into a {@link WebInfoBean} and hands the beans to
     * {@link #save(List)}.
     *
     * <p>The URL handling is tailored to the URL layout of Sina Finance article links.
     * A link whose article cannot be downloaded is still reported, with its content left
     * unset, so that one broken link does not discard the whole result.
     *
     * @param url     address of the news list page
     * @param divname value of the {@code class} attribute of the div that holds the links
     * @throws IOException if the list page itself cannot be downloaded
     */
    private static void getDatelistBean(String url, String divname) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("div[class=" + divname + "]").select("a[href]");
        List<WebInfoBean> list = new ArrayList<WebInfoBean>(links.size());
        for (Element link : links) {
            // Resolve against the page's base URI so that relative hrefs become fetchable.
            String absolute = link.absUrl("href");
            boolean fetchable = isHttpUrl(absolute);

            WebInfoBean bean = new WebInfoBean();
            bean.setUrl(fetchable ? absolute : link.attr("href"));
            bean.setBaseUrl(link.baseUri());
            // text() is safe for anchors without children and returns unescaped text.
            bean.setTitle(link.text());

            String time = extractTime(bean.getUrl());
            if (time != null) {
                bean.setTime(time);
            }

            if (fetchable) {
                try {
                    bean.setContent(extContent(absolute, sub_div_name));
                } catch (IOException ex) {
                    log.warn("Failed to download article " + absolute, ex);
                }
            } else {
                log.warn("Skipping content download for non-HTTP link: " + bean.getUrl());
            }
            list.add(bean);
        }
        save(list);
    }

    /**
     * Tells whether {@code url} is an absolute HTTP or HTTPS address.
     *
     * @param url candidate address, never {@code null}
     * @return {@code true} if the address starts with {@code http://} or {@code https://}
     */
    private static boolean isHttpUrl(String url) {
        return url.startsWith("http://") || url.startsWith("https://");
    }

    /**
     * Derives a time stamp from the last two path segments of an article URL: the
     * second-to-last segment followed by the first four characters of the last segment.
     * NOTE(review): presumably these are a date directory and an HHmm file-name prefix;
     * confirm against real Sina Finance URLs.
     *
     * @param url article address
     * @return the concatenated time stamp, or {@code null} if the URL has four or fewer
     *         {@code /}-separated parts or its last part is shorter than four characters
     */
    private static String extractTime(String url) {
        String[] parts = url.split("/");
        int length = parts.length;
        if (length <= 4) {
            return null;
        }
        String fileName = parts[length - 1];
        if (fileName.length() < 4) {
            log.warn("Cannot derive a time stamp from " + url);
            return null;
        }
        return parts[length - 2] + fileName.substring(0, 4);
    }

    /**
     * Downloads an article page and returns the text of all {@code <p>} elements inside
     * {@code <div id="divname">}, concatenated without any separator.
     *
     * @param url     address of the article page
     * @param divname value of the {@code id} attribute of the div that holds the article body
     * @return the concatenated paragraph text; empty if no matching paragraph exists
     * @throws IOException if the page cannot be downloaded
     */
    private static String extContent(String url, String divname) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Elements paragraphs = doc.select("div[id=" + divname + "]").select("p");
        StringBuilder text = new StringBuilder();
        for (Element paragraph : paragraphs) {
            text.append(paragraph.text());
        }
        return text.toString();
    }

    /**
     * Prints every bean to standard output, one per line.
     *
     * @param list beans to print
     */
    private static void save(List<WebInfoBean> list) {
        for (WebInfoBean bean : list) {
            System.out.println(bean.toString());
        }
    }

}


bean
/**
 * Mutable value holder for one crawled link: its identifier, title, index, time stamp,
 * downloaded content, the base URL of the page it was found on and its own URL.
 * All properties are plain strings and start out as {@code null}.
 */
public class WebInfoBean {

    private String id;
    private String title;
    private String index;
    private String time;
    private String content;
    private String baseUrl;
    private String url;

    /** @return the identifier, or {@code null} if none was set */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the title, or {@code null} if none was set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the index, or {@code null} if none was set */
    public String getIndex() {
        return this.index;
    }

    /** @param index the index to store */
    public void setIndex(String index) {
        this.index = index;
    }

    /** @return the time stamp, or {@code null} if none was set */
    public String getTime() {
        return this.time;
    }

    /** @param time the time stamp to store */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the content, or {@code null} if none was set */
    public String getContent() {
        return this.content;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the base URL, or {@code null} if none was set */
    public String getBaseUrl() {
        return this.baseUrl;
    }

    /** @param baseUrl the base URL to store */
    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** @return the URL, or {@code null} if none was set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders all seven properties in a fixed order; unset properties appear as {@code null}.
     *
     * @return a single-line, human-readable description of this bean
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder("WebInfoBean [id=");
        out.append(this.id);
        out.append(", title=").append(this.title);
        out.append(", index=").append(this.index);
        out.append(", time=").append(this.time);
        out.append(", content=").append(this.content);
        out.append(", baseUrl=").append(this.baseUrl);
        out.append(", url=").append(this.url);
        out.append("]");
        return out.toString();
    }

}