爬虫入门案例
需求: 模拟登录"慢慢买"网站, 登录成功以后, 获取当前用户的积分信息
登录的 URL: http://home.manmanbuy.com/login.aspx
访问这个 URL 需要提交的表单参数:
__VIEWSTATE, __EVENTVALIDATION, txtUser, txtPass, btnLogin
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
/**
 * Crawler starter example: simulates a login to manmanbuy.com and prints the
 * points of the logged-in user.
 *
 * <p>Flow: POST the ASP.NET login form, expect an HTTP 302 answer, then GET the
 * redirect target while replaying the cookies issued by the login response, and
 * finally scrape the points value out of the returned HTML with jsoup.
 */
public class ManManSpider {

    /** Login form endpoint; also sent as the Referer and used as base for the redirect. */
    private static final String LOGIN_URL = "http://home.manmanbuy.com/login.aspx";

    /**
     * Charset used to encode the form body and to decode the result page.
     * NOTE(review): assumes the site expects UTF-8 form data - confirm.
     */
    private static final String CHARSET = "UTF-8";

    /** CSS selector locating the points value on the page reached after login. */
    private static final String POINTS_SELECTOR =
            "#aspnetForm > div.udivright > div:nth-child(2) > table > tbody > tr > td:nth-child(1) > table:nth-child(2) > tbody > tr > td:nth-child(2) > div:nth-child(1) > font";

    /**
     * Logs in, follows the login redirect manually and prints the user's points
     * to standard output. Problems with the login answer (unexpected status,
     * missing {@code Location} header) are reported on standard error.
     *
     * @param args unused
     * @throws Exception if a request fails or a response cannot be read
     */
    public static void main(String[] args) throws Exception {
        String redirectUrl;
        String cookieHeader;

        // Step 1: submit the login form. Client and response are closed as soon
        // as the redirect target and the cookies have been extracted.
        try (CloseableHttpClient loginClient = HttpClients.createDefault()) {
            HttpPost httpPost = new HttpPost(LOGIN_URL);
            // An explicit charset is required: without it the non-ASCII value of
            // "btnLogin" cannot be represented in the library's default encoding.
            httpPost.setEntity(new UrlEncodedFormEntity(buildLoginForm(), CHARSET));
            httpPost.setHeader("Referer", LOGIN_URL);

            try (CloseableHttpResponse loginResponse = loginClient.execute(httpPost)) {
                int statusCode = loginResponse.getStatusLine().getStatusCode();
                // The code treats a 302 answer as the sign of a successful login.
                if (statusCode != 302) {
                    System.err.println("Login failed: expected HTTP 302 but got " + statusCode);
                    return;
                }

                Header[] locations = loginResponse.getHeaders("Location");
                if (locations.length == 0 || locations[0].getValue() == null) {
                    System.err.println("Login response has no Location header");
                    return;
                }
                // Resolving against the login URL handles absolute as well as
                // relative Location values.
                redirectUrl = java.net.URI.create(LOGIN_URL)
                        .resolve(locations[0].getValue().trim())
                        .toString();

                cookieHeader = buildCookieHeader(loginResponse.getHeaders("Set-Cookie"));
            }
        }

        // Step 2: request the redirect target with a fresh client, replaying the
        // login cookies by hand.
        String html = fetchPage(redirectUrl, cookieHeader);

        // Step 3: parse the HTML and print the points value.
        Document document = Jsoup.parse(html);
        Elements pointsElements = document.select(POINTS_SELECTOR);
        System.out.println(pointsElements.text());
    }

    /**
     * Builds the login form fields.
     *
     * <p>NOTE(review): the __VIEWSTATE / __EVENTVALIDATION values and the
     * credentials are hard-coded; presumably they were captured from a browser
     * session and may need refreshing - confirm.
     */
    private static List<BasicNameValuePair> buildLoginForm() {
        List<BasicNameValuePair> form = new ArrayList<>();
        form.add(new BasicNameValuePair("__VIEWSTATE", "/wEPDwULLTIwNjQ3Mzk2NDFkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQlhdXRvTG9naW4voj01ABewCkGpFHsMsZvOn9mEZg=="));
        form.add(new BasicNameValuePair("__EVENTVALIDATION", "/wEWBQLW+t7HAwLB2tiHDgLKw6LdBQKWuuO2AgKC3IeGDJ4BlQgowBQGYQvtxzS54yrOdnbC"));
        form.add(new BasicNameValuePair("txtUser", "itcast"));
        form.add(new BasicNameValuePair("txtPass", "www.itcast.cn"));
        form.add(new BasicNameValuePair("btnLogin", "登陆"));
        return form;
    }

    /**
     * Turns Set-Cookie response headers into a single Cookie request header value.
     * Only the leading {@code name=value} part of each header is kept (attributes
     * after the first ';' are dropped) and the pairs are joined with "; ".
     *
     * @param setCookieHeaders the Set-Cookie headers of the login response
     * @return the Cookie header value, or an empty string when there are no cookies
     */
    private static String buildCookieHeader(Header[] setCookieHeaders) {
        StringBuilder cookie = new StringBuilder();
        for (Header setCookie : setCookieHeaders) {
            String raw = setCookie.getValue();
            if (raw == null) {
                continue;
            }
            int attributesStart = raw.indexOf(';');
            String pair = (attributesStart >= 0 ? raw.substring(0, attributesStart) : raw).trim();
            if (pair.isEmpty()) {
                continue;
            }
            if (cookie.length() > 0) {
                cookie.append("; ");
            }
            cookie.append(pair);
        }
        return cookie.toString();
    }

    /**
     * Downloads a page with a new client, sending the given cookies.
     *
     * @param url          absolute URL to request
     * @param cookieHeader Cookie header value; not sent when empty
     * @return the response body decoded with {@link #CHARSET}
     * @throws Exception if the request fails or the body cannot be read
     */
    private static String fetchPage(String url, String cookieHeader) throws Exception {
        try (CloseableHttpClient pageClient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(url);
            if (!cookieHeader.isEmpty()) {
                httpGet.setHeader("Cookie", cookieHeader);
            }
            try (CloseableHttpResponse pageResponse = pageClient.execute(httpGet)) {
                return EntityUtils.toString(pageResponse.getEntity(), CHARSET);
            }
        }
    }
}