欢迎您访问程序员文章站,本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

Jsoup抓取

程序员文章站 2024-01-18 09:35:40
...

1.下载Jsoup核心库

 

地址: http://jsoup.org/download

 

Jsoup抓取
            
    
    博客分类: jsoup jsoup抓取 

 

2.使用

  /**
	 * Connects to the given address with jsoup and returns the parsed Document.
	 *
	 * @param sourceUrl		address to connect to
	 * @return 			the Document obtained from the connection
	 * @throws IOException	propagated from the jsoup request
	 * @author chitianxiang $Feb 6th, 2012
	 */
	private static Document getDocument(String sourceUrl) throws IOException {
		Connection conn = Jsoup.connect(sourceUrl);
		/*
		 * HTTP 500 errors:
		 * a site that recognises the request as coming from a crawler may
		 * simply refuse it, so send a browser-like User-Agent header as a
		 * light disguise.
		 */
		conn.header("User-Agent", "Mozilla/5.0 (Macintosh; " 
				+ "U; Intel Mac OS X 10.4; en-US; " 
				+ "rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2"); 
		
		// Use the timeout constant this class actually declares.
		return conn.timeout(JSOUP_TIMEOUT_MAX_VAL).get();
	}
 
	private static final int JSOUP_TIMEOUT_MAX_VAL = 10000;	// Longest response time allowed for a jsoup fetch (presumably milliseconds - confirm against the jsoup timeout API)
	private static Map<String, String[]> data; // Data sources: entry name -> {site name, URL}; populated by init()

	// Site names; stored as the first element of each data entry and compared against sourceName in doGrab2Lst
	private static final String SINA = "新浪";
	private static final String IFENG = "凤凰网";

  /**
	 * Loads the built-in data sources into {@code data}.
	 * Returns immediately, without printing anything, when {@code data}
	 * already holds at least one entry.
	 *
	 * @throws Exception kept in the signature for existing callers; nothing
	 *                   in this method throws a checked exception
	 * @author chitianxiang 2011/11/3
	 */
	public static void init() throws Exception{
		// Guard first, so the "loading" message is only printed when loading really happens.
		if (null != data && !data.isEmpty()) {
			return;
		}
		System.out.println("开始加载数据源...");
		
		// Fill a local map and assign it at the end, so data only ever refers to a complete map.
		Map<String, String[]> sources = new HashMap<String, String[]>();
		sources.put("焦点新闻1", new String[]{SINA, "http://rss.sina.com.cn/news/china/politics15.xml"});
		sources.put("焦点新闻2", new String[]{IFENG, "http://news.ifeng.com/mainland/"});
		data = sources;
	}

  /**
	 * Collects the data-source entries whose key contains the given type name.
	 * Calls {@code init()} first when {@code data} has not been created yet.
	 *
	 * @param intactTypeName full type name searched for inside each key
	 * @return the non-null values of all matching entries; empty when none match
	 * @author chitianxiang 2011/11/3
	 */
	public static List<String[]> getDataLst(String intactTypeName) 
			throws Exception{

		if (data == null) {
			init();
		}

		List<String[]> matches = new ArrayList<String[]>();
		for (Map.Entry<String, String[]> entry : data.entrySet()) {
			String[] source = entry.getValue();
			// Key test comes first, matching the original evaluation order.
			if (!entry.getKey().contains(intactTypeName) || source == null) {
				continue;
			}
			matches.add(source);
		}
		return matches;
	}

       /**
	 * Scrapes list data by dispatching to the routine for the named site.
	 * A source name that is neither {@code SINA} nor {@code IFENG} is ignored.
	 *
	 * @param sourceName site name
	 * @param sourceUrl site address
	 * @param map collection handed through to the site-specific routine
	 * @author chitianxiang 2011/11/3
	 */
	public static void doGrab2Lst(String sourceName, String sourceUrl, Map map) throws Exception{
		if (SINA.equals(sourceName)) {
			grabLstBySina(sourceName, sourceUrl, map);
		} else if (IFENG.equals(sourceName)) {
			// Arguments are comma-separated; the original had "sourceName. sourceUrl".
			grabLstByIfeng(sourceName, sourceUrl, map);
		}
	}

   /**
	 * Scrapes the Sina list feed: fetches the document at sourceUrl and walks
	 * every {@code item} element, reading its title, description and guid.
	 * Items with an empty description are skipped. Any failure is reported on
	 * the console and not rethrown.
	 *
	 * @param sourceName site name, used in the failure message
	 * @param sourceUrl address of the feed
	 * @param map collection supplied by the caller
	 * @author chitianxiang 2011/11/5
	 */
	private static void grabLstBySina(String sourceName, String sourceUrl, Map map)
			throws Exception{
		
		try {
			Document doc = getDocument(sourceUrl);
			Elements elements = doc.select("item");
			
			for (Element element : elements) {
				String title = element.select("title").text(); // headline
				String content = element.select("description").text(); // displayed content
				// Test the description that was just read; the original referenced an undefined name.
				if ("".equals(content)) {
					continue;
				}
				String outUrl= element.select("guid").text(); // external link URL
				
				// NOTE(review): title, content and outUrl are not stored anywhere here;
				// the step that records them (presumably into map) is not present - confirm.
			}
		} catch (Exception e) {
			// Still best-effort, but report the cause instead of discarding it.
			System.out.println("抓取" + sourceName + "失败!!!");
			System.err.println("grabLstBySina failed for " + sourceUrl + ": " + e);
		}
	}
 

 

 

 

相关标签: jsoup 抓取