爬虫 java 百度图片
程序员文章站
2024-02-26 09:39:52
...
记录一下java 写的爬虫软件下载百度图片
具体流程如下
- 找到对应的链接
- 解析链接的url
- 下载
1.查询百度图片的地址
打开百度图片,然后搜索图片,按F12打开控制台,切换到XHR可以看到有一个登录信息的链接
这个时候我们继续滑动百度图片
就会发现它多了其他的链接,选择链接,查看对应的属性,可以看到它的返回值
然后通过对比我们可以发现,其中pn是控制页数的,queryWord是控制查询关键字的
这个时候我们就获取到对应的链接了,
2.解析返回值
在解析返回值之前我们得通过链接获取到对应的返回值,具体代码如下
/**
 * Performs an HTTP GET against the given URL and returns the response body as text.
 *
 * @param url absolute URL to request (must include the scheme, e.g. {@code https://})
 * @return the response body decoded as text when the status code is 200;
 *         an empty string when the status code is anything else or the response has no body
 * @throws Exception if the URL is malformed or the request/response handling fails
 */
public static String doGetOhter(String url) throws Exception {
    // try-with-resources guarantees the client and the response are closed even when
    // execute() or EntityUtils.toString() throws; the original leaked both in that case.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
        // URIBuilder parses the URL and re-encodes the query parameters on build().
        HttpGet httpGet = new HttpGet(new URIBuilder(url).build());
        // Request headers: declared request encoding and the expected response encoding.
        httpGet.setHeader(new BasicHeader("Content-Type", "application/x-www-form-urlencoded; utf-8"));
        httpGet.setHeader(new BasicHeader("Accept", "text/plain;charset=utf-8"));
        try (CloseableHttpResponse response = client.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                System.out.println("请求失败!");
                return "";
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // A 200 response may legitimately carry no body.
                return "";
            }
            // Fall back to UTF-8 (instead of the library default ISO-8859-1) when the
            // server does not declare a charset, so non-ASCII text is not garbled.
            return EntityUtils.toString(entity, "UTF-8");
        }
    }
}
这个时候我们就可以解析返回值
这里我使用的是 com.alibaba.fastjson
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
</dependency>
具体代码如下
/**
 * Queries the Baidu image search JSON endpoint and downloads the thumbnail of every hit.
 *
 * <p>One request is sent per page; each page asks for 30 results ({@code rn=30}) at
 * offset {@code pn = page * 30}. The raw JSON of every page is printed to standard output.
 * Failures on one page are printed and do not stop the remaining pages.
 *
 * @author yangyue
 * @param Qury      search keyword; it is URL-encoded before being placed in the query string
 * @param bastnuber number of pages to fetch (i.e. desired image count / 30);
 *                  values below 1 fetch nothing
 */
public static void downloadBiaduPic(String Qury, int bastnuber) {
    // NOTE(review): paging starts at pn=30, so results at offset 0-29 are never requested.
    // Kept as in the original — confirm whether the first page should be included.
    for (int i = 1; i < bastnuber + 1; i++) {
        String josnString = null;
        try {
            // Encode the keyword so characters such as '&', '#', '%' or spaces cannot
            // break or alter the query string.
            String encodedQuery = java.net.URLEncoder.encode(Qury, "UTF-8");
            // "&copyright=" was garbled to "©right=" in the original literal.
            String url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord="
                    + encodedQuery
                    + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word="
                    + encodedQuery
                    + "&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn="
                    + i * 30
                    + "&rn=30&gsm=&666=";
            josnString = doGetOhter(url);
            // Read the "data" array directly instead of a raw Map + toString/parseArray round trip.
            JSONObject root = JSON.parseObject(josnString);
            JSONArray jsonArray = root == null ? null : root.getJSONArray("data");
            if (jsonArray != null) {
                for (int g = 0; g < jsonArray.size(); g++) {
                    JSONObject partDaily = jsonArray.getJSONObject(g);
                    if (partDaily == null) {
                        continue;
                    }
                    String jsonEntrylinks = partDaily.getString("thumbURL");
                    // Entries without a thumbnail URL would otherwise be handed to wget
                    // as the literal text "null".
                    if (jsonEntrylinks == null || jsonEntrylinks.trim().isEmpty()) {
                        continue;
                    }
                    wgetDown(jsonEntrylinks); // hand the link to the downloader
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(josnString);
    }
}
3.下载
这个时候获取到下载链接,直接下载就行,这里我使用的wget下载
/**
 * Downloads one URL into {@code D:\tmp\WetPic\} by running {@code D:\wgetfile\wget.exe}.
 *
 * <p>wget is started directly with an argument list (no {@code cmd /c start} and no
 * concatenated command line), so the URL is passed as a single argument and is never
 * interpreted by a shell. The method blocks until wget exits.
 *
 * @param downUrl URL of the file to download
 * @return {@code null} when wget exits with code 0; the text {@code "下载出错"} when the URL
 *         is null/blank, wget cannot be started, exits with a non-zero code, or the
 *         calling thread is interrupted
 */
public static String wgetDown(String downUrl) {
    if (downUrl == null || downUrl.trim().isEmpty()) {
        return "下载出错";
    }
    try {
        ProcessBuilder builder = new ProcessBuilder(
                "D:\\wgetfile\\wget.exe", downUrl, "-P", "D:\\tmp\\WetPic\\");
        // Merge stderr into stdout so a single drain loop is enough and the child
        // cannot block on a full stderr pipe.
        builder.redirectErrorStream(true);
        Process process = builder.start();
        // wget needs no input from us.
        process.getOutputStream().close();
        try (InputStream in = process.getInputStream()) {
            byte[] buffer = new byte[4096];
            while (in.read(buffer) != -1) {
                // Output is intentionally discarded; reading keeps the pipe from filling up.
            }
        }
        if (process.waitFor() != 0) {
            return "下载出错";
        }
        // Short pause between downloads, as in the original.
        Thread.sleep(500);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can observe the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
        return "下载出错";
    } catch (Exception e) {
        e.printStackTrace();
        return "下载出错";
    }
    return null;
}
其中需要一个 wget.exe,代码中假定它位于 D:\wgetfile\wget.exe,下载的图片会保存到 D:\tmp\WetPic\ 目录
全部代码如下
/**
 * Entry point: downloads one page of Baidu image results for a fixed keyword.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String keyword = "海贼王";
    final int pageCount = 1;
    downloadBiaduPic(keyword, pageCount);
}
/**
 * Queries the Baidu image search JSON endpoint and downloads the thumbnail of every hit.
 *
 * <p>One request is sent per page; each page asks for 30 results ({@code rn=30}) at
 * offset {@code pn = page * 30}. The raw JSON of every page is printed to standard output.
 * Failures on one page are printed and do not stop the remaining pages.
 *
 * @author yangyue
 * @param Qury      search keyword; it is URL-encoded before being placed in the query string
 * @param bastnuber number of pages to fetch (i.e. desired image count / 30);
 *                  values below 1 fetch nothing
 */
public static void downloadBiaduPic(String Qury, int bastnuber) {
    // NOTE(review): paging starts at pn=30, so results at offset 0-29 are never requested.
    // Kept as in the original — confirm whether the first page should be included.
    for (int i = 1; i < bastnuber + 1; i++) {
        String josnString = null;
        try {
            // Encode the keyword so characters such as '&', '#', '%' or spaces cannot
            // break or alter the query string.
            String encodedQuery = java.net.URLEncoder.encode(Qury, "UTF-8");
            // "&copyright=" was garbled to "©right=" in the original literal.
            String url = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord="
                    + encodedQuery
                    + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word="
                    + encodedQuery
                    + "&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn="
                    + i * 30
                    + "&rn=30&gsm=&666=";
            josnString = doGetOhter(url);
            // Read the "data" array directly instead of a raw Map + toString/parseArray round trip.
            JSONObject root = JSON.parseObject(josnString);
            JSONArray jsonArray = root == null ? null : root.getJSONArray("data");
            if (jsonArray != null) {
                for (int g = 0; g < jsonArray.size(); g++) {
                    JSONObject partDaily = jsonArray.getJSONObject(g);
                    if (partDaily == null) {
                        continue;
                    }
                    String jsonEntrylinks = partDaily.getString("thumbURL");
                    // Entries without a thumbnail URL would otherwise be handed to wget
                    // as the literal text "null".
                    if (jsonEntrylinks == null || jsonEntrylinks.trim().isEmpty()) {
                        continue;
                    }
                    wgetDown(jsonEntrylinks);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.out.println(josnString);
    }
}
/**
 * Downloads one URL into {@code D:\tmp\WetPic\} by running {@code D:\wgetfile\wget.exe}.
 *
 * <p>wget is started directly with an argument list (no {@code cmd /c start} and no
 * concatenated command line), so the URL is passed as a single argument and is never
 * interpreted by a shell. The method blocks until wget exits.
 *
 * @param downUrl URL of the file to download
 * @return {@code null} when wget exits with code 0; the text {@code "下载出错"} when the URL
 *         is null/blank, wget cannot be started, exits with a non-zero code, or the
 *         calling thread is interrupted
 */
public static String wgetDown(String downUrl) {
    if (downUrl == null || downUrl.trim().isEmpty()) {
        return "下载出错";
    }
    try {
        ProcessBuilder builder = new ProcessBuilder(
                "D:\\wgetfile\\wget.exe", downUrl, "-P", "D:\\tmp\\WetPic\\");
        // Merge stderr into stdout so a single drain loop is enough and the child
        // cannot block on a full stderr pipe.
        builder.redirectErrorStream(true);
        Process process = builder.start();
        // wget needs no input from us.
        process.getOutputStream().close();
        try (InputStream in = process.getInputStream()) {
            byte[] buffer = new byte[4096];
            while (in.read(buffer) != -1) {
                // Output is intentionally discarded; reading keeps the pipe from filling up.
            }
        }
        if (process.waitFor() != 0) {
            return "下载出错";
        }
        // Short pause between downloads, as in the original.
        Thread.sleep(500);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can observe the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
        return "下载出错";
    } catch (Exception e) {
        e.printStackTrace();
        return "下载出错";
    }
    return null;
}
/**
 * Performs an HTTP GET against the given URL and returns the response body as text.
 *
 * @param url absolute URL to request (must include the scheme, e.g. {@code https://})
 * @return the response body decoded as text when the status code is 200;
 *         an empty string when the status code is anything else or the response has no body
 * @throws Exception if the URL is malformed or the request/response handling fails
 */
public static String doGetOhter(String url) throws Exception {
    // try-with-resources guarantees the client and the response are closed even when
    // execute() or EntityUtils.toString() throws; the original leaked both in that case.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
        // URIBuilder parses the URL and re-encodes the query parameters on build().
        HttpGet httpGet = new HttpGet(new URIBuilder(url).build());
        // Request headers: declared request encoding and the expected response encoding.
        httpGet.setHeader(new BasicHeader("Content-Type", "application/x-www-form-urlencoded; utf-8"));
        httpGet.setHeader(new BasicHeader("Accept", "text/plain;charset=utf-8"));
        try (CloseableHttpResponse response = client.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                System.out.println("请求失败!");
                return "";
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                // A 200 response may legitimately carry no body.
                return "";
            }
            // Fall back to UTF-8 (instead of the library default ISO-8859-1) when the
            // server does not declare a charset, so non-ASCII text is not garbled.
            return EntityUtils.toString(entity, "UTF-8");
        }
    }
}
下一篇: Raspberry Pi 3B学习笔记