Java Crawler with HttpClient and Jsoup
Java Crawler
Crawling: automatically fetching information from the internet according to a set of rules.
Jar Packages Used
Environment:
- JDK 1.8
- IDEA
- Maven
Jar dependency:
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.8</version>
</dependency>
1. A Simple Demo
We mimic the way a browser opens a web page in order to fetch some data: create a client, issue a GET request, and read the response.
Note: some pages cannot be fetched this way (a common reason and a workaround are sketched right after the code).
package com.kj.Util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

/**
 * Sends the simplest possible request.
 */
public class HtmlParseUtil {
    public static void main(String[] args) throws IOException {
        String url = "https://www.itcast.cn";
        // 1. "Open the browser": create an HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. "Type the address": build a GET request
        HttpGet httpGet = new HttpGet(url);
        // 3. "Press Enter": send the request with the client
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 4. Parse the response: check the status code, then read the body
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = response.getEntity();
            String s = EntityUtils.toString(entity, "utf-8");
            System.out.println(s);
        }
        // Release the connection resources
        response.close();
        httpClient.close();
    }
}
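One frequent reason a page "cannot be fetched" is that the site inspects request headers and rejects clients that do not look like a browser. A minimal sketch of setting a browser-like User-Agent on the request, reusing the httpClient created above (the header value is just an illustrative example):

// Pretend to be a regular browser; the exact value below is an arbitrary example
HttpGet httpGet = new HttpGet("https://www.itcast.cn");
httpGet.setHeader("User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0 Safari/537.36");
CloseableHttpResponse response = httpClient.execute(httpGet);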
2. A GET Request with Parameters
package com.kj.Util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;

/**
 * GET request with query parameters.
 */
public class HttpGet_2 {
    public static void main(String[] args) throws URISyntaxException {
        // 1. Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. Rewrite the URL with URIBuilder
        URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
        // 3. Set the query parameters
        uriBuilder.setParameter("name", "value");
        // 4. Create the HttpGet object from the built URI
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        CloseableHttpResponse response = null;
        try {
            // 5. Send the request
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String s = EntityUtils.toString(entity, "utf-8");
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response and the client, guarding against a failed request
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
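URIBuilder can also carry several parameters: setParameter replaces an existing value, while addParameter appends one more pair. A small sketch of what the rewritten URL ends up looking like (the second parameter name is made up for illustration):

URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search")
        .addParameter("so", "java")     // first query parameter
        .addParameter("type", "all");   // hypothetical second parameter
// Prints http://yun.itheima.com/search?so=java&type=all
System.out.println(uriBuilder.build());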
3. A POST Request with Parameters
package com.kj.Util;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class HttpPost_3 {
    public static void main(String[] args) throws Exception {
        // 1. Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpPost post = new HttpPost("http://yun.itheima.com/search");
        // 2. Declare a list that holds the form parameters
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("so", "java"));
        // 3. Create the form entity; first argument: the POST parameters, second: the charset
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(params, "utf-8");
        // 4. Attach the form entity to the POST request
        post.setEntity(entity);
        CloseableHttpResponse response = null;
        try {
            // 5. Send the request
            response = httpClient.execute(post);
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String s = EntityUtils.toString(httpEntity, "utf-8");
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response and the client, guarding against a failed request
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
4. Connection Pooling
Analogous to a JDBC connection pool, a PoolingHttpClientConnectionManager manages HttpClient connections so they can be reused instead of being created for every request.
package com.kj.Util;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class Pool_4 {
    public static void main(String[] args) {
        // Create the connection pool manager
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // Maximum total number of connections in the pool
        cm.setMaxTotal(100);
        // Maximum number of connections per route (per host), e.g. at most 10 connections to one site
        cm.setDefaultMaxPerRoute(10);
        // Send a request using the pool
        doGet(cm);
    }

    private static void doGet(PoolingHttpClientConnectionManager cm) {
        // Instead of creating a brand-new client each time, build one backed by the pool
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        HttpGet get = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(get);
            if (response.getStatusLine().getStatusCode() == 200) {
                String s = EntityUtils.toString(response.getEntity(), "utf-8");
                System.out.println(s);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Do not close httpClient here: closing it would shut down the shared connection pool
        }
    }
}
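When one pooled client serves all requests, it is also common to give it default timeouts so a slow site cannot hang the crawler. A minimal sketch using RequestConfig, assuming org.apache.http.client.config.RequestConfig is imported and cm is the pool manager from the code above (the millisecond values are arbitrary examples):

RequestConfig config = RequestConfig.custom()
        .setConnectTimeout(1000)             // max time to establish the TCP connection
        .setConnectionRequestTimeout(500)    // max time to obtain a connection from the pool
        .setSocketTimeout(10 * 1000)         // max time to wait for response data
        .build();
CloseableHttpClient httpClient = HttpClients.custom()
        .setConnectionManager(cm)
        .setDefaultRequestConfig(config)
        .build();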
5. Parsing the Page
The response is just a raw HTML string, so we need a dedicated HTML-processing tool.
5.1 The HTML Parser Jsoup
jsoup is an HTML parser for Java. It can parse HTML directly from a URL, from an HTML string, or from a file, and it provides a very convenient API for extracting and manipulating data via DOM traversal, CSS selectors, and jQuery-like methods.
Maven dependencies:
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<!-- companion utilities -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.13</version>
    <scope>test</scope>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.6</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.10</version>
</dependency>
5.2 A Simple Fetch
package com.kj.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;

import java.net.URL;

public class _6UrlForJsoup {
    @Test
    public void testUrl() throws Exception {
        // Parse the page at the given URL (timeout in milliseconds)
        Document document = Jsoup.parse(new URL("http://www.bilibili.com/"), 10000);
        // Read the content of the <title> tag
        String title = document.getElementsByTag("title").first().text();
        System.out.println(title);
    }
}
Note: although Jsoup can send requests itself and could in principle replace HttpClient, real crawlers rely on multithreading, connection pooling, proxies, and so on, which Jsoup does not support well. In practice, Jsoup is therefore used only as the HTML parsing tool; the sketch below shows the usual division of labour.
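A minimal sketch of that division of labour, assuming the HttpClient and Jsoup imports from the examples above (the URL is just an example):

// Fetch the HTML with HttpClient ...
CloseableHttpClient httpClient = HttpClients.createDefault();
CloseableHttpResponse response = httpClient.execute(new HttpGet("http://www.itcast.cn"));
String html = EntityUtils.toString(response.getEntity(), "utf-8");
response.close();
httpClient.close();

// ... and hand the string to Jsoup for parsing
Document document = Jsoup.parse(html);
System.out.println(document.title());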
5.3 Parsing an HTML String
@Test
public void testHtml() throws Exception {
    // Read the file into a string with commons-io
    String content = FileUtils.readFileToString(new File("./src/main/resources/KJ_Study.html"), "utf-8");
    // Parse the string
    Document document = Jsoup.parse(content);
    String title = document.getElementsByTag("title").first().text();
    System.out.println(title);
}
5.4 Parsing a File
@Test
public void testFile() throws Exception {
    // Parse the file directly
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"), "utf-8");
    String title = document.getElementsByTag("title").first().text();
    System.out.println(title);
}
5.5 [Advanced] Extracting Data from an Element
@Test
public void testData() throws Exception {
    // Obtain the Document
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"), "utf-8");
    Element element = document.getElementById("userSkin");
    /*
    The selected element looks roughly like this:
    <div id="userSkin">
        <div class="user-skin-box">
            <p class="user-skin-title">
                <span>自定义博客皮肤<span class="vip-get">VIP专享<span></span>
    .....
    */
    // 1. Get the id
    System.out.println(element.id());
    // 2. Get the class name
    System.out.println(element.child(0).className());
    // If there are several class names, use classNames()
    System.out.println(element.child(0).classNames());
    // 3. Get an attribute value by attribute name
    System.out.println(element.attr("id"));
    // 4. Get all attributes of the element
    Attributes attributes = element.attributes();
    System.out.println(attributes);
    // 5. Get the text content
    Elements elementsByClass = document.getElementsByClass("user-skin-title");
    System.out.println(elementsByClass.first().text());
}
5.6 [Advanced] Selectors
@Test
public void testSeletor() throws Exception {
    // Parse the file and obtain the document
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"), "utf-8");
    // Select by tag name
    Elements elements = document.select("span");
    for (Element e : elements) {
        System.out.println(e.text());
    }
    // #id: select by id
    Elements e2 = document.select("#userSkin");
    for (Element e : e2) {
        System.out.println(e.child(0).className());
    }
    // .class: select by class name
    Elements e3 = document.select(".user-skin-box");
    for (Element e : e3) {
        System.out.println(e.child(0).className());
    }
    // [attribute]: select by attribute name
    Elements e4 = document.select("[viewBox]");
    for (Element e : e4) {
        System.out.println(e.className());
    }
    // [attribute=value]: select by attribute name and value
    Elements e5 = document.select("[t=1567152543821]");
    for (Element e : e5) {
        System.out.println(e.className());
    }
}
5.7 [Advanced] Combining Selectors
@Test
public void testSelecter2() throws Exception {
    // Parse the file
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"), "utf-8");
    // el#id: element + id
    Elements elements = document.select("div#cropBox");
    for (Element e : elements) {
        System.out.println(e.child(0).className());
    }
    // el.class: element + class
    Elements e2 = document.select("span.close-bt");
    for (Element e : e2) {
        System.out.println(e.className());
    }
    // el[attr]: element + attribute name
    Elements e3 = document.select("svg[t]");
    for (Element e : e3) {
        System.out.println(e.className());
    }
    // Any combination of the above
    Elements e4 = document.select("svg[t].icon");
    for (Element e : e4) {
        System.out.println(e.className());
    }
    // ancestor child: descendants of an element
    Elements e5 = document.select("p[class] span");
    for (Element e : e5) {
        System.out.println(e.text());
    }
    // parent > child: direct children of a parent element ("direct" means immediately nested)
    Elements e6 = document.select("p[class] > span > svg");
    for (Element e : e6) {
        System.out.println(e.className());
    }
    // parent > *: all direct children of a parent element
    Elements e7 = document.select("p[class] > span > *");
    for (Element e : e7) {
        System.out.println(e.tagName());
    }
}
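Selectors become genuinely useful once their results are collected into ordinary Java data structures. A minimal sketch that gathers every link in the same sample file, assuming java.util.List and ArrayList are imported alongside the Jsoup classes used above and that the file actually contains <a> tags:

@Test
public void testExtractLinks() throws Exception {
    Document document = Jsoup.parse(new File("./src/main/resources/KJ_Study.html"), "utf-8");
    // Collect the target and text of every <a> element that has an href attribute
    List<String> links = new ArrayList<>();
    for (Element a : document.select("a[href]")) {
        links.add(a.attr("href") + " -> " + a.text());
    }
    links.forEach(System.out::println);
}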