Jsoup抓取
程序员文章站
2024-01-18 09:35:40
...
1.下载Jsoup核心库
2.使用
/**
 * Connects to the given address with jsoup and returns the fetched Document.
 *
 * A browser-like User-Agent header is sent with the request, and the
 * connection timeout is set to MAX_CONNECT_TIME (declared outside this
 * snippet).
 *
 * @param sourceUrl address to connect to
 * @return the Document obtained from the connection
 * @throws IOException propagated from the jsoup connection
 * @author chitianxiang $Feb 6th, 2012
 */
private static Document getDocument(String sourceUrl) throws IOException {
    Connection conn = Jsoup.connect(sourceUrl);

    /*
     * HTTP 500 errors:
     * the remote site recognises the request as coming from a crawler and
     * simply refuses access, so add header information to disguise the
     * request a little.
     */
    conn.header("User-Agent", "Mozilla/5.0 (Macintosh; "
            + "U; Intel Mac OS X 10.4; en-US; "
            + "rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2");

    return conn.timeout(MAX_CONNECT_TIME).get();
}

/**
 * Misspelled original name of {@link #getDocument(String)}; kept only so
 * that code still calling it continues to compile.
 *
 * @param sourceUrl address to connect to
 * @return the Document obtained from the connection
 * @throws IOException propagated from the jsoup connection
 * @deprecated use {@link #getDocument(String)} instead
 */
@Deprecated
private static Document getDocment(String sourceUrl) throws IOException {
    return getDocument(sourceUrl);
}
// Longest time to wait for a response when grabbing with Jsoup
// (presumably milliseconds - confirm against where it is used).
private static final int JSOUP_TIMEOUT_MAX_VAL = 10000;

// Data sources: entry name -> { site name, URL }.
private static Map<String, String[]> data;

private static final String SINA = "新浪";
private static final String IFENG = "凤凰网";

/**
 * Loads the built-in data sources.
 *
 * Does nothing when the data-source map already holds entries; the
 * "loading" message is printed only when a load actually takes place.
 *
 * @throws Exception declared by the original signature; nothing in this
 *         body throws a checked exception
 * @author chitianxiang 2011/11/3
 */
public static void init() throws Exception {
    if (null != data && !data.isEmpty()) {
        return;
    }
    System.out.println("开始加载数据源...");
    data = new HashMap<String, String[]>();
    data.put("焦点新闻1", new String[]{SINA, "http://rss.sina.com.cn/news/china/politics15.xml"});
    data.put("焦点新闻2", new String[]{IFENG, "http://news.ifeng.com/mainland/"});
}

/**
 * Collects the data sources whose entry name contains the given text.
 *
 * Calls {@link #init()} first when the data-source map has not been
 * created yet.
 *
 * @param intactTypeName full type name; matched as a substring of each
 *        entry name
 * @return the { site name, URL } arrays of all matching entries; empty
 *         when nothing matches
 * @throws Exception propagated from {@link #init()}
 * @author chitianxiang 2011/11/3
 */
public static List<String[]> getDataLst(String intactTypeName) throws Exception {
    List<String[]> list = new ArrayList<String[]>();
    if (null == data) {
        init();
    }
    // Walk the entries directly instead of looking every key up again.
    for (Map.Entry<String, String[]> entry : data.entrySet()) {
        String[] source = entry.getValue();
        if (entry.getKey().contains(intactTypeName) && null != source) {
            list.add(source);
        }
    }
    return list;
}

/**
 * Dispatches list grabbing to the grabber that matches the site name.
 *
 * Site names other than SINA and IFENG are ignored.
 *
 * @param sourceName site name
 * @param sourceUrl  site address
 * @param map        collection handed through to the site grabber
 * @throws Exception propagated from the site grabber
 * @author chitianxiang 2011/11/3
 */
public static void doGrab2Lst(String sourceName, String sourceUrl, Map map) throws Exception {
    if (SINA.equals(sourceName)) {
        grabLstBySina(sourceName, sourceUrl, map);
    } else if (IFENG.equals(sourceName)) {
        grabLstByIfeng(sourceName, sourceUrl, map);
    }
}

/**
 * Grabs the Sina reading list: fetches the document at the given address
 * and reads the title, description and guid text of every "item" element.
 *
 * Any failure is reported on the console and not rethrown.
 *
 * @param sourceName site name, used in the failure message
 * @param sourceUrl  site address
 * @param map        collection (not written to by the code shown here)
 * @throws Exception declared by the original signature; failures inside
 *         the body are caught and reported
 * @author chitianxiang 2011/11/5
 */
private static void grabLstBySina(String sourceName, String sourceUrl, Map map) throws Exception {
    try {
        Document doc = getDocument(sourceUrl);
        Elements elements = doc.select("item");
        for (Element element : elements) {
            String title = element.select("title").text();         // title
            String content = element.select("description").text(); // displayed content
            if ("".equals(content)) {
                // Skip items that have no description text.
                continue;
            }
            String outUrl = element.select("guid").text();         // external link URL
            // NOTE(review): title, content and outUrl are not stored anywhere
            // in the code shown here - confirm what should be put into map.
        }
    } catch (Exception e) {
        System.out.println("抓取" + sourceName + "失败!!!");
        // Report the cause as well so the failure can be diagnosed.
        System.err.println(e);
    }
}