大数据正式京淘
程序员文章站
2022-04-01 20:18:13
大数据正式京淘
爬虫技术
httpClient:抓取整个页面
htmlUnit:可以二次提交
jsoup:兼具以上两种技术的能力(既能抓取整个页面,也能获取二次提交返回的数据)
jsoup
爬取整个页面
爬取整个网...
大数据正式京淘
爬虫技术
httpClient:抓取整个页面
htmlUnit:可以二次提交
jsoup:兼具以上两种技术的能力(既能抓取整个页面,也能获取二次提交返回的数据)
jsoup
爬取整个页面
爬取整个网站
爬取页面中的某一个定位信息
爬取二次提交--ajax
爬取jsonp数据
例子
测试之前的准备
// Jackson mapper used by the JSON-based tests; assigned in beforeTest().
private ObjectMapper om;

/** Creates a fresh ObjectMapper; annotated to run as JUnit "before" setup. */
@org.junit.Before
public void beforeTest() {
    this.om = new ObjectMapper();
}
整个页面
// Whole page: request the URL and dump the raw response body to stderr.
@org.junit.Test
public void pageTest() throws Exception {
    Response homePage = Jsoup.connect("https://www.jd.com/").execute();
    String rawBody = homePage.body();
    System.err.println(rawBody);
}
整个网站
// Whole site: parse the landing page and print the href of every <a> tag,
// i.e. the links a crawler would follow next.
@org.junit.Test
public void siteTest() throws Exception {
    Document homePage = Jsoup.connect("https://www.jd.com/").get();
    Elements anchors = homePage.getElementsByTag("a");
    for (int i = 0; i < anchors.size(); i++) {
        System.err.println("链接地址:" + anchors.get(i).attr("href"));
    }
}
某一定位信息
// Targeted extraction: use a CSS selector to pick specific links out of the page.
@org.junit.Test
public void sitePositionTest() throws Exception {
    Document homePage = Jsoup.connect("https://www.jd.com/").get();
    // In the selector, a space separates an ancestor from its descendant.
    for (Element link : homePage.select("#navitems-group2 .fore2 a")) {
        System.err.println("链接地址:" + link.attr("href"));
    }
}
形如ajax二次提交的数据
// AJAX ("second request") data: call the price endpoint directly and read
// the "p" field of the first element of the returned JSON array.
@org.junit.Test
public void ajaxTest() throws Exception {
    // ignoreContentType(true) lets jsoup accept a non-HTML response.
    String json = Jsoup.connect("http://p.3.cn/prices/mgets?skuIds=J_5089253")
            .ignoreContentType(true)
            .execute()
            .body();
    JsonNode firstEntry = om.readTree(json).get(0);
    String price = firstEntry.get("p").asText();
    System.err.println(price);
}
jsonp数据的获取
// JSONP data: the response is JSON wrapped in a callback call, e.g. callback({...}),
// so the wrapper has to be stripped before the payload can be parsed.
// (The original snippet here was a verbatim copy of ajaxTest and never showed JSONP.)
// NOTE(review): uses the same description endpoint and "content" field that
// JDutils.getDesc reads; confirm the endpoint still answers in this shape.
@org.junit.Test
public void jsonpTest() throws Exception {
    String url = "http://d.3.cn/desc/5089253";
    // ignoreContentType(true) lets jsoup accept a non-HTML response.
    String body = Jsoup.connect(url).ignoreContentType(true).execute().body();
    // Keep only the text between the first '(' and the last ')'.
    String json = body.substring(body.indexOf('(') + 1, body.lastIndexOf(')'));
    JsonNode readTree = om.readTree(json);
    JsonNode contentNode = readTree.get("content");
    System.err.println(contentNode.asText());
}
抓取京东
抓取京东
抓的是什么
对象:Item
抓取步骤
找到三级分类
处理当前分类的总页数:读取总页数后,拼接出该三级分类下每一页的链接
找到每一页中的所有的链接地址
将每个商品的信息封装为 Item 对象
例子
item
package com.peng.pojo;
/**
 * Plain data holder for one scraped product.
 *
 * <p>The {@code title} field, its accessors and the matching {@code toString()}
 * fragment had lost their identifier in the published listing (leaving
 * {@code private String ;}, {@code get()} and {@code set(String )}), which did
 * not compile; they are restored here.
 */
public class Item {
    private Long id; // product ID
    private String title; // product title
    private String desc; // product description
    private String sellPoint; // product selling point
    // NOTE(review): the scraper in this article stores the price with the decimal
    // point removed (e.g. "59.00" -> 5900); confirm the unit expected by consumers.
    private Long price; // product price
    // Product images, format: big1###small1***big2###small2***big3###small3
    // (remember to split before use).
    private String images;

    /** @return the product ID */
    public Long getId() {
        return id;
    }

    /** @param id the product ID */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the product title */
    public String getTitle() {
        return title;
    }

    /** @param title the product title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the product description */
    public String getDesc() {
        return desc;
    }

    /** @param desc the product description */
    public void setDesc(String desc) {
        this.desc = desc;
    }

    /** @return the product selling point */
    public String getSellPoint() {
        return sellPoint;
    }

    /** @param sellPoint the product selling point */
    public void setSellPoint(String sellPoint) {
        this.sellPoint = sellPoint;
    }

    /** @return the product price */
    public Long getPrice() {
        return price;
    }

    /** @param price the product price */
    public void setPrice(Long price) {
        this.price = price;
    }

    /** @return the encoded image list ({@code big###small} pairs joined by {@code ***}) */
    public String getImages() {
        return images;
    }

    /** @param images the encoded image list ({@code big###small} pairs joined by {@code ***}) */
    public void setImages(String images) {
        this.images = images;
    }

    @Override
    public String toString() {
        return "Item [id=" + id + ", title=" + title + ", desc=" + desc + ", sellPoint=" + sellPoint + ", price="
                + price + ", Images=" + images + "]";
    }
}
具体爬虫数据
package com.peng.util;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.peng.pojo.Item;
public class JDutils {
private static final String JDTHREE_URL = "http://www.jd.com/allSort.aspx";// 京东的所有商品的网址
private static final Logger log = Logger.getLogger(JDutils.class);// 日志对象
private static final ObjectMapper om = new ObjectMapper();// om对象
// 获取三级分类的所有地址
public static List getCarUrls(String url) throws Exception {
List catUrls = new ArrayList();
Elements eles = Jsoup.connect(url).get().select("div dl dd a");
for (Element ele : eles) {
String catUrl = "http:" + ele.attr("href");
//
if (catUrl.startsWith("http://list.jd.com/list.html")) {
catUrls.add(catUrl);
}
}
// 获取当前分类的页数
return catUrls;
}
// 获取当前连接的页数,拼接获取所有页数的链接
public static List getPageUrls(String url) {
List pageUrlLists = new ArrayList();
try {
String pageNum = Jsoup.connect(url).get().select("#J_topPage span i").get(0).text();
Integer page = Integer.parseInt(pageNum);
for (int i = 1; i < page; i++) {
String pageUrl = url + "&page=" + i;
pageUrlLists.add(pageUrl);
}
} catch (Exception e) {
log.error(e.getMessage());
}
return pageUrlLists;
}
// 抓取每一页下的所有商品链接
public static List getItemUrls(String url) {
List itemLists = new ArrayList();
try {
Elements eles = Jsoup.connect(url).get().select(".j-sku-item .p-name a");
for (Element ele : eles) {
String itemUrl = "http:" + ele.attr("href");
itemLists.add(itemUrl);
}
} catch (Exception e) {
log.error(e.getMessage());
}
return itemLists;
}
// 测试能否链接获取所有链接
public static List getAllItemUrls(String url) {
List allItemLists = new ArrayList();
long startTime = System.currentTimeMillis();
try {
int i = 1;
for (String catUrl : JDutils.getCarUrls(url)) {
for (String pageUrl : JDutils.getPageUrls(catUrl)) {
allItemLists.addAll(JDutils.getItemUrls(pageUrl));
}
i++;
if (i == 2) {
// 测试:循环两个链接
break;
}
}
} catch (Exception e) {
log.error(e.getMessage());
}
long endTime = System.currentTimeMillis();
System.err.println(allItemLists);
System.err.println("SIZE:" + allItemLists.size());
System.err.println("TIME:" + ((endTime - startTime) / 1000.0) + "秒");
return allItemLists;
}
// 获取标题
public static String get(String url) {
String = "";
try {
= Jsoup.connect(url).get().select(".m-item-inner").select("#itemInfo #name h1").get(0).text();
} catch (Exception e) {
log.error(e.getMessage());
}
return ;
}
// 获取卖点
public static String getSellPoint(Long itemId) {
String sellPoint = "";
String url = "http://ad.3.cn/ads/mgets?skuIds=AD_" + itemId;
try {
Response response = Jsoup.connect(url).ignoreContentType(true).execute();
String body = response.body();
JsonNode readTree = om.readTree(body);
JsonNode sellPointNode = readTree.get(0).get("ad");
sellPoint = sellPointNode.asText();
System.err.println(sellPoint);
} catch (Exception e) {
log.error(e.getMessage());
}
return sellPoint;
}
// 获取价格
public static Long getPrice(Long itemId) {
Long price = null;
String url = "http://p.3.cn/prices/mgets?skuIds=J_" + itemId;
try {
Response response = Jsoup.connect(url).ignoreContentType(true).execute();
String body = response.body();
JsonNode readTree = om.readTree(body);
JsonNode sellPointNode = readTree.get(0).get("p");
price = Long.parseLong(sellPointNode.asText().replace(".", ""));
} catch (Exception e) {
log.error(e.getMessage());
}
return price;
}
// 商品的图片【格式:大图1###小图1***大图2###小图2***大图3###小图3】--记得拆分
public static String getImages(String url) {
String images = "";
try {
// 大图
Element bigEle = Jsoup.connect(url).get().select("#spec-n1 img").get(0);
// 大图的全路径模板
String bigUrl = "http:" + bigEle.attr("src");
String bigImagePrefix = bigUrl.substring(0, bigUrl.indexOf("jfs"));
// 小图
Elements eles = Jsoup.connect(url).get().select("#spec-list ul li img");
for (Element ele : eles) {
String smallImageUrl = ele.attr("src");
// 小图全路径
String smallImageAllUrl = "http:" + smallImageUrl;
// 后面的内容
String smallImageSuffFix = smallImageUrl.substring(smallImageUrl.indexOf("jfs"));
// 大图的全路径
String bigImageAll = bigImagePrefix + smallImageSuffFix;
// 路径拼接
images = bigImageAll + "###" + smallImageAllUrl + "***";
}
System.err.println(images);
} catch (Exception e) {
log.error(e.getMessage());
}
return images;
}
// 商品详情
public static String getDesc(Long itemId) {
String desc = "";
String url = "http://d.3.cn/desc/" + itemId;
try {
String body = Jsoup.connect(url).ignoreContentType(true).execute().body();
// 切分成json格式
String jsonString = body.substring(body.indexOf('(') + 1, body.lastIndexOf(')'));
// 将字符串转换为json格式的数据
JsonNode readTree = om.readTree(jsonString);
desc = readTree.get("content").asText();
// 拿数据
} catch (Exception e) {
log.error(e.getMessage());
}
return desc;
}
// 商品的封装
public static Item getItem(String url) {
Item item = new Item();
try {
item.setId(Long.parseLong(url.substring(url.lastIndexOf('/') + 1, url.lastIndexOf('.'))));
item.setDesc(JDutils.getDesc(item.getId()));
item.setImages(JDutils.getImages(url));
item.setPrice(JDutils.getPrice(item.getId()));
item.setSellPoint(JDutils.getSellPoint(item.getId()));
item.set(JDutils.get(url));
System.err.println(item);
} catch (Exception e) {
log.error(e.getMessage());
}
return item;
}
@Test
public void test() throws Exception {
List urls = getAllItemUrls(JDutils.JDTHREE_URL);
for (String url : urls) {
JDutils.getItem(url);
}
}
}