Java 爬虫模拟知乎登陆
程序员文章站
2024-03-17 21:35:04
...
Java 爬虫模拟知乎登陆
一、知乎登陆页面分析
1、我使用的是 Chrome 浏览器：按 F12 打开开发者工具，切换到 Network 面板，然后登录一次，观察登录过程中请求了哪些 URL，以及 POST 提交了哪些参数。
上传的参数如下:
2、模拟登录知乎分为以下几步：
1）获取 _xsrf
2）获取验证码 captcha
3）登录知乎，拿到 cookie
4）使用 cookie 进一步访问登录后的其他页面
二、Java 模拟知乎登陆
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import javax.imageio.ImageIO;
import org.apache.http.Consts;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
/**
 * Simulates a Zhihu e-mail login with Apache HttpClient.
 *
 * <p>The flow is: fetch the index page and extract the {@code _xsrf} token,
 * download the captcha image and ask the user to type it in, POST the login
 * form, then reuse the same client (and therefore the same cookie store) to
 * request pages that require a logged-in session.
 *
 * <p>All requests go through one shared {@link CloseableHttpClient}; call
 * {@link #close()} when the instance is no longer needed.
 */
public class LogIn {
    /** Opening of the hidden form field that carries the XSRF token. */
    private static final String XSRF_PREFIX = "<input type=\"hidden\" name=\"_xsrf\" value=\"";

    private String indexURL = "https://www.zhihu.com/";
    private String loginURL = "https://www.zhihu.com/login/email";
    private String captchaURL = "https://www.zhihu.com/captcha.gif?type=login";
    protected RequestConfig requestConfig = null;
    protected CloseableHttpClient httpClient = null;

    /**
     * Creates a client that talks to custom endpoints.
     *
     * @param indexURL   page that embeds the {@code _xsrf} hidden field
     * @param loginURL   endpoint the login form is POSTed to
     * @param captchaURL endpoint that serves the captcha image
     */
    public LogIn(String indexURL, String loginURL, String captchaURL) {
        this();
        this.indexURL = indexURL;
        this.loginURL = loginURL;
        this.captchaURL = captchaURL;
    }

    /** Creates a client that talks to the default Zhihu endpoints. */
    public LogIn() {
        super();
        requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT).build();
        httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();
    }

    /**
     * Fetches the index page and extracts the XSRF token from it.
     *
     * @return the value of the hidden {@code _xsrf} form field
     * @throws IOException if the request fails or the page contains no token
     */
    public String getXSRF() throws ClientProtocolException, IOException {
        HttpGet get = new HttpGet(indexURL);
        get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36");
        get.setHeader("Accept", "*/*");
        // "br" is intentionally not advertised: HttpClient 4.x only decodes
        // gzip/deflate, so a Brotli-compressed page could not be parsed.
        get.setHeader("Accept-Encoding", "gzip,deflate");
        get.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
        get.setHeader("Origin", "https://www.zhihu.com");
        get.setHeader("Referer", "https://www.zhihu.com/");

        String responseHtml;
        CloseableHttpResponse response = httpClient.execute(get);
        try {
            responseHtml = readBody(response.getEntity());
        } finally {
            // Always release the connection, even when reading the body fails.
            response.close();
        }
        return extractXsrf(responseHtml);
    }

    /**
     * Downloads the captcha image to {@code src/QQ.jpg} and reads the text the
     * user types on standard input after looking at it.
     *
     * @return the line entered by the user
     * @throws IOException if the download fails, the response is not a
     *                     decodable image, or the image cannot be saved
     */
    public String getCaptcha() throws ClientProtocolException, IOException {
        CloseableHttpResponse response = httpClient.execute(new HttpGet(captchaURL));
        try {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("Captcha response has no body: " + captchaURL);
            }
            InputStream input = entity.getContent();
            try {
                // ImageIO.read returns null when no registered reader understands the data.
                BufferedImage image = ImageIO.read(input);
                if (image == null) {
                    throw new IOException("Captcha response is not a readable image: " + captchaURL);
                }
                File target = new File("src/QQ.jpg");
                // ImageIO.write returns false when no writer accepts the image.
                if (!ImageIO.write(image, "jpg", target)) {
                    throw new IOException("Could not save captcha image to " + target.getPath());
                }
            } finally {
                input.close();
            }
        } finally {
            response.close();
        }

        System.out.print("captcha:");
        // The scanner is deliberately left open: closing it would close
        // System.in and break any later read from standard input.
        Scanner console = new Scanner(System.in);
        return console.nextLine();
    }

    /**
     * Submits the login form. Cookies set by the server are kept in the
     * client's cookie store for subsequent requests.
     *
     * @param email    account e-mail address
     * @param password account password
     * @return the raw login response (status line, headers and JSON body);
     *         the caller is responsible for consuming its entity
     * @throws IOException if any of the underlying requests fails
     */
    public HttpResponse logIn(String email, String password) throws ClientProtocolException, IOException {
        List<NameValuePair> valuePairs = new LinkedList<NameValuePair>();
        valuePairs.add(new BasicNameValuePair("_xsrf", getXSRF()));
        valuePairs.add(new BasicNameValuePair("email", email));
        valuePairs.add(new BasicNameValuePair("password", password));
        valuePairs.add(new BasicNameValuePair("captcha", getCaptcha()));

        HttpPost post = new HttpPost(loginURL);
        post.setEntity(new UrlEncodedFormEntity(valuePairs, Consts.UTF_8));
        return httpClient.execute(post);
    }

    /**
     * Requests a page with the logged-in session, prints its body and returns it.
     *
     * <p>The session cookies are sent by the shared client's cookie store, so
     * nothing is copied from {@code httpResponse}: {@code Set-Cookie} is a
     * response header and has no meaning on a request.
     *
     * @param httpResponse the login response; unused, kept so existing callers
     *                     continue to compile
     * @param url          address of the page to fetch
     * @return the page body
     * @throws IOException if the request fails
     */
    public String visitURL(HttpResponse httpResponse, String url) throws ClientProtocolException, IOException {
        HttpGet get = new HttpGet(url);
        CloseableHttpResponse pageResponse = httpClient.execute(get);
        try {
            String content = readBody(pageResponse.getEntity());
            System.out.println(content);
            return content;
        } finally {
            pageResponse.close();
        }
    }

    /**
     * Releases the underlying HTTP client and its connections.
     *
     * @throws IOException if closing the client fails
     */
    public void close() throws IOException {
        if (httpClient != null) {
            httpClient.close();
        }
    }

    /**
     * Reads an entity as text, using UTF-8 when the server declares no charset.
     * Returns an empty string for a missing entity.
     */
    private static String readBody(HttpEntity entity) throws IOException {
        if (entity == null) {
            return "";
        }
        return EntityUtils.toString(entity, Consts.UTF_8);
    }

    /**
     * Pulls the {@code _xsrf} value out of the index page HTML.
     *
     * @throws IOException if the hidden field is absent or not terminated
     */
    private static String extractXsrf(String html) throws IOException {
        int start = html.indexOf(XSRF_PREFIX);
        if (start < 0) {
            throw new IOException("_xsrf hidden field not found in index page");
        }
        start += XSRF_PREFIX.length();
        // The value ends at the attribute's closing quote.
        int end = html.indexOf('"', start);
        if (end < 0) {
            throw new IOException("_xsrf hidden field is not terminated in index page");
        }
        return html.substring(start, end);
    }

    /**
     * Demo entry point: logs in, dumps the login response, then fetches a page
     * that requires authentication.
     *
     * @param args unused
     * @throws IOException             if any request fails
     * @throws ClientProtocolException on an HTTP protocol error
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        LogIn login = new LogIn();
        try {
            HttpResponse httpResponse = login.logIn("aaa@qq.com", "xxxxxxx");
            StatusLine responseState = httpResponse.getStatusLine();
            System.out.println(responseState.toString());

            Header[] headers = httpResponse.getAllHeaders();
            for (int i = 0; i < headers.length; i++) {
                System.out.println(headers[i].getName() + ": " + headers[i].getValue());
            }

            // Reading the body also releases the login connection.
            String responseString = readBody(httpResponse.getEntity());
            System.out.println(responseString);
            // Expected on success:
            // {"r":0,
            //  "msg": "\u767b\u5f55\u6210\u529f"
            // }

            login.visitURL(httpResponse, "http://www.zhihu.com/question/following");
        } finally {
            login.close();
        }
    }
}
运行效果