Jsoup爬虫 demo
程序员文章站
2022-04-01 23:13:38
...
在 pom.xml 文件中添加下面的内容
处理逻辑
bean
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Coordinates of the crawler demo project -->
  <groupId>webContent</groupId>
  <artifactId>com.xly.webContent</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <!-- Extra repositories; the URLs must not contain whitespace after the scheme -->
  <repositories>
    <repository>
      <id>com.springsource.repository.bundles.release</id>
      <name>EBR Spring Release Repository</name>
      <url>http://repository.springsource.com/maven/bundles/release</url>
    </repository>
    <repository>
      <id>com.springsource.repository.bundles.external</id>
      <name>EBR External Release Repository</name>
      <url>http://repository.springsource.com/maven/bundles/external</url>
    </repository>
  </repositories>

  <properties>
    <!-- Referenced below as the version of the spring-core dependency -->
    <org.springframework.version>3.0.5.RELEASE</org.springframework.version>
  </properties>

  <dependencies>
    <dependency>
      <!-- jsoup HTML parser library @ http://jsoup.org/ -->
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-core</artifactId>
      <version>${org.springframework.version}</version>
    </dependency>
    <dependency>
      <!-- Logging API only; no SLF4J binding is declared in this POM -->
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.6.1</version>
    </dependency>
  </dependencies>
</project>
处理逻辑
package com.xly.jsoup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.xly.jsoup.bean.WebInfoBean;

/**
 * Demo crawler built on jsoup: downloads a news index page, follows every
 * link found in its list container, extracts the article text of each linked
 * page and prints the collected {@link WebInfoBean}s to standard output.
 *
 * @author Kaikai
 * @version $Id: WebContentMain.java, v 0.1 2014-10-26 09:49:32 AM Kaikai Exp $
 */
public class WebContentMain {

    public static final String BASE_URL = "";

    public static final Logger log = LoggerFactory.getLogger(WebContentMain.class);

    static String base_url = "http://finance.sina.com.cn/";

    /** URL prefix of the per-stock news list; the page name (e.g. sh600158.phtml) is appended. */
    static String base_info_url = "http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/";

    /** Value of the {@code id} attribute of the div holding the article body. */
    static String sub_div_name = "artibody";

    /**
     * Crawls the news list of stock {@code sh600158}.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        getDatelistBean(base_info_url + "sh600158.phtml", "datelist");
    }

    /**
     * Fetches {@code url}, collects every link inside {@code <div class="divname">},
     * downloads the article behind each link and hands the resulting beans to
     * {@link #save(List)}. The parsing logic matches the URL structure of Sina Finance.
     *
     * @param url     address of the list page
     * @param divname value of the {@code class} attribute of the list container div
     * @throws IOException if the list page or any article page cannot be fetched
     */
    private static void getDatelistBean(String url, String divname) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("div[class=" + divname + "]").select("a[href]");
        List<WebInfoBean> list = new ArrayList<WebInfoBean>();
        for (Element link : links) {
            WebInfoBean bean = new WebInfoBean();
            bean.setUrl(resolveHref(link));
            bean.setBaseUrl(link.baseUri());
            // text() is safe for anchors without child nodes and yields plain text, not markup.
            bean.setTitle(link.text());

            // The time is built from the second-to-last path segment plus the first
            // four characters of the last one (presumably date and hour/minute in
            // Sina article URLs - confirm against real data).
            String[] segments = bean.getUrl().split("/");
            int count = segments.length;
            if (count > 4) {
                String last = segments[count - 1];
                if (last.length() >= 4) {
                    bean.setTime(segments[count - 2] + last.substring(0, 4));
                } else {
                    // Last segment too short to derive a time: report the URL and go on.
                    System.out.println(bean.getUrl());
                }
            }

            bean.setContent(extContent(bean.getUrl(), sub_div_name));
            list.add(bean);
        }
        save(list);
    }

    /**
     * Returns the link target as an absolute URL so it can be fetched; falls back
     * to the raw {@code href} value when it cannot be resolved against the base URI.
     *
     * @param link anchor element carrying an {@code href} attribute
     * @return absolute URL if resolvable, otherwise the raw attribute value
     */
    private static String resolveHref(Element link) {
        String absolute = link.absUrl("href");
        return absolute.length() > 0 ? absolute : link.attr("href");
    }

    /**
     * Parses the content of an article page: concatenates the text of every
     * {@code <p>} inside {@code <div id="divname">}, without any separator.
     *
     * @param url     address of the article page
     * @param divname value of the {@code id} attribute of the content div
     * @return the concatenated paragraph text; empty if nothing matched
     * @throws IOException if the page cannot be fetched
     */
    private static String extContent(String url, String divname) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Elements paragraphs = doc.select("div[id=" + divname + "]").select("p");
        StringBuilder content = new StringBuilder();
        for (Element paragraph : paragraphs) {
            content.append(paragraph.text());
        }
        return content.toString();
    }

    /**
     * Outputs the crawled beans; this demo simply prints each one to standard output.
     *
     * @param list beans to output
     */
    private static void save(List<WebInfoBean> list) {
        for (WebInfoBean bean : list) {
            System.out.println(bean.toString());
        }
    }
}
bean
/**
 * Mutable value holder for one crawled page: identifier, title, index, time,
 * extracted content, base URL and page URL. Every property is a plain
 * {@code String} that stays {@code null} until its setter is called.
 */
public class WebInfoBean {

    private String id;
    private String title;
    private String index;
    private String time;
    private String content;
    private String baseUrl;
    private String url;

    /** @return the identifier, or {@code null} if unset */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the title, or {@code null} if unset */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the index, or {@code null} if unset */
    public String getIndex() {
        return this.index;
    }

    /** @param index the index to store */
    public void setIndex(String index) {
        this.index = index;
    }

    /** @return the time, or {@code null} if unset */
    public String getTime() {
        return this.time;
    }

    /** @param time the time to store */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the content, or {@code null} if unset */
    public String getContent() {
        return this.content;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the base URL, or {@code null} if unset */
    public String getBaseUrl() {
        return this.baseUrl;
    }

    /** @param baseUrl the base URL to store */
    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** @return the URL, or {@code null} if unset */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders all properties in declaration order as
     * {@code WebInfoBean [id=..., title=..., ...]}; unset properties appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("WebInfoBean [");
        text.append("id=").append(this.id);
        text.append(", title=").append(this.title);
        text.append(", index=").append(this.index);
        text.append(", time=").append(this.time);
        text.append(", content=").append(this.content);
        text.append(", baseUrl=").append(this.baseUrl);
        text.append(", url=").append(this.url);
        text.append("]");
        return text.toString();
    }
}