Java 爬虫 WebMagic 案例:爬取动态(ajax+js)网站京东的商品价格
程序员文章站
2022-05-05 15:41:26
...
代码共分为3个类:
爬取京东手机ID与名称
爬取京东手机ID与价格
-
组织json
-
之所以没有把这几个类合并在一起,是因为中间还有一个拼接价格查询URL的步骤。
项目采用maven管理
工程pom.xml文件见下一个博文
-
package org.study.WebMagicStudy;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor for the JD.com mobile-phone listing pages.
 *
 * <p>For every listing page matching {@link #URL_LIST} it extracts the SKU ids
 * and product names, asks {@link JDJsonPreocessor} for the prices of those SKUs,
 * and records the combined result in the static {@link #map}. It then queues
 * every further listing-page link found on the page.
 */
public class JDAjaxProcessor implements PageProcessor {

    /** Regex matching the paginated listing URLs that are processed and followed. */
    public static final String URL_LIST = "http://list\\.jd\\.com/list\\.html\\?cat=9987,653,655&page=\\d+\\&go=0\\&JL=6_0_0";

    /** Collected results: key is the SKU id, value is the product name, a tab, then the price. */
    static Map<String,String> map = new HashMap<String, String>();

    /** Not referenced anywhere in this class; kept so that outside references still compile. */
    static Set<String> uri = new HashSet<String>();

    /**
     * Crawls the listing starting at page 1, then prints every collected entry,
     * the number of entries, and the entry for one sample SKU id.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String list = "http://list.jd.com/list.html?cat=9987,653,655&page=1&go=0&JL=6_0_0";
        Spider.create(new JDAjaxProcessor()).addUrl(list)
                .addPipeline(new FilePipeline("D:\\webmagic\\"))
                .run();
        for (String s : map.values()) {
            System.out.println(s);
        }
        System.out.println("map-->" + map.size());
        System.out.println(map.get("10274956063"));
    }

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * @return the site configuration (3 retries, 100 ms sleep time)
     */
    public Site getSite() {
        return site;
    }

    /**
     * Extracts ids and names from a listing page, looks up the prices and stores
     * the combined rows in {@link #map}. Pages whose URL does not match
     * {@link #URL_LIST} are left untouched.
     *
     * @param page the downloaded page
     */
    public void process(Page page) {
        if (!page.getUrl().regex(URL_LIST).match()) {
            return;
        }

        List<String> ids = page.getHtml().xpath("//div[@class='p-focus']/a/@data-sku").all();
        List<String> names = page.getHtml().xpath("//div[@class='p-name']/a/em/text()").all();
        page.putField("id", ids);
        page.putField("name", names);

        // A page without any SKU has nothing to price; skip the price request
        // but still follow the pagination links below.
        if (!ids.isEmpty()) {
            // key: "J_" + id, value: price
            Map<String, String> prices = JDJsonPreocessor.running(makerUrl(ids));
            // The two XPath results are paired by position; stop at the shorter
            // list so a mismatch between them cannot index out of bounds.
            int count = Math.min(ids.size(), names.size());
            for (int i = 0; i < count; i++) {
                String id = ids.get(i);
                String price = prices.get("J_" + id);
                map.put(id, names.get(i) + "\t" + price);
            }
        }

        page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all());
    }

    /**
     * Builds the price-query URL for the given SKU ids. Each id is prefixed with
     * {@code J_} and the ids are joined with commas.
     *
     * @param ids the SKU ids; {@code null} or empty yields a URL with an empty
     *            {@code skuIds} parameter instead of throwing
     * @return the price-query URL
     */
    public String makerUrl(List<String> ids){
        StringBuilder skuIds = new StringBuilder();
        if (ids != null) {
            for (String id : ids) {
                if (skuIds.length() > 0) {
                    skuIds.append(',');
                }
                skuIds.append("J_").append(id);
            }
        }
        return "http://p.3.cn/prices/mgets?skuIds=" + skuIds + "&callback=result";
    }

    /** Currently does nothing. */
    public void writeFile(){
    }
}
package org.study.WebMagicStudy;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that downloads a price response and extracts the
 * {@code id} and {@code p} value of every entry of the JSON array it contains.
 *
 * <p>Results are accumulated in the static {@link #maps}; entries from earlier
 * calls to {@link #running(String)} are kept.
 */
public class JDJsonPreocessor implements PageProcessor {

    /** Accumulated prices: key is the entry's {@code id}, value is its {@code p}. */
    static Map<String,String> maps = new HashMap<String, String>();

    /**
     * Runs a spider on the given URL and returns the accumulated price map.
     *
     * @param url the price-query URL to download
     * @return the shared static map (not a copy) holding every price seen so far
     */
    public static Map<String,String> running(String url) {
        Spider.create(new JDJsonPreocessor()).addUrl(url).run();
        return maps;
    }

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * @return the site configuration (3 retries, 100 ms sleep time)
     */
    public Site getSite() {
        return site;
    }

    /**
     * Cuts the JSON array out of the raw response text and stores each entry's
     * {@code id} / {@code p} pair in {@link #maps}. A response without an array,
     * an unparsable array, or an entry missing either field is ignored instead
     * of aborting with an exception.
     *
     * @param page the downloaded price response
     */
    public void process(Page page) {
        page.setSkip(true);

        String text = page.getRawText();
        if (text == null) {
            return;
        }

        // The array is wrapped in a callback call; take everything from the
        // first '[' to the last ']' so nested arrays cannot truncate it.
        int begin = text.indexOf('[');
        int end = text.lastIndexOf(']');
        if (begin < 0 || end < begin) {
            return;
        }

        // jsonToList returns null when the text cannot be parsed.
        List<Map<String, Object>> entries = JsonUtil.jsonToList(text.substring(begin, end + 1));
        if (entries == null) {
            return;
        }

        for (Map<String, Object> entry : entries) {
            if (entry == null) {
                continue;
            }
            Object id = entry.get("id");
            Object price = entry.get("p");
            if (id == null || price == null) {
                continue;
            }
            maps.put(id.toString(), price.toString());
        }
    }
}
package org.study.WebMagicStudy;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
/**
 * JSON helper methods built on Jackson's {@link ObjectMapper}.
 *
 * <p>All methods are best-effort: a conversion failure is printed to standard
 * error and reported through the return value ({@code null} for the parsing
 * methods, an empty string for the serializing methods) rather than thrown.
 */
public class JsonUtil {

    /** Single mapper shared by all methods instead of building a new one per call. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Parses a JSON object string into a map.
     *
     * @param jsonStr the JSON text to parse
     * @return the parsed map, or {@code null} if parsing failed
     */
    @SuppressWarnings("unchecked")
    public static Map<String,Object> jsonToMap(String jsonStr){
        Map<String, Object> map = null;
        try {
            map = MAPPER.readValue(jsonStr, Map.class);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return map;
    }

    /**
     * Serializes a map to a JSON string.
     *
     * @param m the map to serialize
     * @return the JSON text, or an empty string if serialization failed
     */
    public static String mapToJson(Map<String,Object> m) {
        String json = "";
        try {
            json = MAPPER.writeValueAsString(m);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return json;
    }

    /**
     * Serializes a list of maps to a JSON array string.
     *
     * @param list the list to serialize
     * @return the JSON text, or an empty string if serialization failed
     */
    public static String listToJson(List<Map<String,Object>> list) {
        String json = "";
        try {
            json = MAPPER.writeValueAsString(list);
        } catch (JsonProcessingException e) {
            e.printStackTrace();
        }
        return json;
    }

    /**
     * Parses a JSON array string into a list of maps.
     *
     * @param jsonStr the JSON text to parse
     * @return the parsed list, or {@code null} if parsing failed
     */
    @SuppressWarnings("unchecked")
    public static List<Map<String,Object>> jsonToList(String jsonStr){
        List<Map<String, Object>> list = null;
        try {
            list = MAPPER.readValue(jsonStr, List.class);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }
}
转载于:https://my.oschina.net/yimi88/blog/663821
上一篇: pipeline(3)
下一篇: Scrapy抓取数据存储到Excel