欢迎您访问程序员文章站，本站旨在为大家提供分享程序员计算机编程知识！
您现在的位置是: 首页

jsoup + json 解析网页

程序员文章站 2024-01-18 09:44:28
...
package com.teamdev.jxbrowser.chromium.demo_lingshui.baidunuomi.goods;

import java.awt.BorderLayout;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.JFrame;
import javax.swing.WindowConstants;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.hyjx.common.CommonUtil;
import com.hyjx.orclJdbcUtil.JDBCUtils;
import com.teamdev.jxbrowser.chromium.Browser;
import com.teamdev.jxbrowser.chromium.BrowserPreferences;
import com.teamdev.jxbrowser.chromium.JSValue;
import com.teamdev.jxbrowser.chromium.LoggerProvider;
import com.teamdev.jxbrowser.chromium.events.FinishLoadingEvent;
import com.teamdev.jxbrowser.chromium.events.LoadAdapter;
import com.teamdev.jxbrowser.chromium.swing.BrowserView;
/**
* 百度糯米(陵水市) plat_code(010) 美食
* @author 1
*
*/
public class ls_baidunuomi_goods_meishi{
public static void main(String[] args) throws Exception {


java.sql.Connection conOrcale  = null;
try {

conOrcale =    JDBCUtils.getConnection();

} catch (SQLException e1) {
e1.printStackTrace();
}
String sql = null;
PreparedStatement ps = null;
//创建添加sql
try{
sql = "insert into ls_nm_shop_good   "+
"  (good_id, good_name, good_url)  "+
"  values " +
"( ?   ,      ? ,     ?   )  ";
  
ps = conOrcale.prepareStatement(sql);
}catch (Exception e) {
e.printStackTrace();
}
String good_name="";
String good_url="";

       Document doc = null;
try{
doc = Jsoup.connect("https://lingshui.nuomi.com/326-page1?#j-sort-bar").userAgent("Mozilla")
.header("method", "GET")
.header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
.header("Accept-Encoding:","gzip, deflate, sdch")
.header("Accept-Language","zh-CN,zh;q=0.8")
.header("Cache-Control","max-age=0")
.header("Connection","keep-alive")
.header("Host","lingshui.nuomi.com")
.header("Upgrade-Insecure-Requests","1")
.header("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
.ignoreContentType(true).timeout(200000).data()
.get();
             } catch (Exception e2) {
e2.printStackTrace();
}
String text = doc.html();
Document document = Jsoup.parse(text);
        String pag= document.getElementsByAttributeValue("class","page-number").text();
        int page_totle =Integer.parseInt(pag.substring(pag.length()-1));
        System.out.println(page_totle);
        //循环翻页
       
        for(int i=1;i<=page_totle;i++){
        //睡眠2秒
        try {
Thread.sleep(2000);
        } catch (InterruptedException e1) {
    e1.printStackTrace();
    }
       
       
        doc = null;
     try{
     doc = Jsoup.connect("https://lingshui.nuomi.com/326-page"+i+"?#j-sort-bar").userAgent("Mozilla")
     .header("method", "GET")
     .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
     .header("Accept-Encoding:","gzip, deflate, sdch")
     .header("Accept-Language","zh-CN,zh;q=0.8")
     .header("Cache-Control","max-age=0")
     .header("Connection","keep-alive")
     .header("Host","lingshui.nuomi.com")
     .header("Upgrade-Insecure-Requests","1")
     .header("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
     .ignoreContentType(true).timeout(200000).data()
     .get();
                  } catch (Exception e2) {
     e2.printStackTrace();
     }
     text = doc.html();
     document = Jsoup.parse(text);
     Elements elements= document.getElementsByAttributeValue("class","itemlist clearfix").select("li");
     System.out.println("elements.size()"+elements.size());
    
     for(Element e : elements){
     Document parse = Jsoup.parse(e.html());
       good_name = parse.getElementsByAttributeValue("class","title").select("h4").text();
       good_url ="https:"+parse.getElementsByAttributeValue("class","contentbox").select("a").attr("href");
       System.out.println("good_name:"+good_name);
     System.out.println("good_url:"+good_url);
     try {
             //  (good_id, good_name, good_url)
     ps.setString(1,CommonUtil.getUUID32());
     ps.setString(2,good_name);
     ps.setString(3,good_url);
     ps.executeUpdate();
     } catch (Exception e1) {
     // TODO Auto-generated catch block
     e1.printStackTrace();
     }
     }
    
/**
* 第一次加载
* */     
     if(i==1){
        doc = null;
     try{
     doc = Jsoup.connect("https://lingshui.nuomi.com/326?async_load_page=2&_=1477897343482").userAgent("Mozilla")
     .header("method", "GET")
     .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
     .header("Accept-Encoding:","gzip, deflate, sdch")
     .header("Accept-Language","zh-CN,zh;q=0.8")
     .header("Cache-Control","max-age=0")
     .header("Connection","keep-alive")
     .header("Host","lingshui.nuomi.com")
     .header("Upgrade-Insecure-Requests","1")
     .header("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
     .ignoreContentType(true).timeout(200000).data()
     .get();
                  } catch (Exception e2) {
     e2.printStackTrace();
     }
     text = doc.html();
     document = Jsoup.parse(text);
     elements= document.select("li");
     System.out.println("elements.size()"+elements.size());
    
     for(Element e : elements){
     Document parse = Jsoup.parse(e.html());
       good_name = parse.getElementsByAttributeValue("class","title").select("h4").text();
       good_url ="https:"+parse.getElementsByAttributeValue("class","contentbox").select("a").attr("href");
       System.out.println("good_name:"+good_name);
     System.out.println("good_url:"+good_url);
     try {
             //  (good_id, good_name, good_url)
     ps.setString(1,CommonUtil.getUUID32());
     ps.setString(2,good_name);
     ps.setString(3,good_url);
     ps.executeUpdate();
     } catch (Exception e1) {
     // TODO Auto-generated catch block
     e1.printStackTrace();
     }
     }
     doc = null;
     try{
     doc = Jsoup.connect("https://lingshui.nuomi.com/326?async_load_page=3&_=1477897343484").userAgent("Mozilla")
     .header("method", "GET")
     .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
     .header("Accept-Encoding:","gzip, deflate, sdch")
     .header("Accept-Language","zh-CN,zh;q=0.8")
     .header("Cache-Control","max-age=0")
     .header("Connection","keep-alive")
     .header("Host","lingshui.nuomi.com")
     .header("Upgrade-Insecure-Requests","1")
     .header("User-Agent","Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36")
     .ignoreContentType(true).timeout(200000).data()
     .get();
                  } catch (Exception e2) {
     e2.printStackTrace();
     }
     text = doc.html();
     document = Jsoup.parse(text);
       elements= document.select("li");
     System.out.println("elements.size()"+elements.size());
    
     for(Element e : elements){
     Document parse = Jsoup.parse(e.html());
       good_name = parse.getElementsByAttributeValue("class","title").select("h4").text();
       good_url ="https:"+parse.getElementsByAttributeValue("class","contentbox").select("a").attr("href");
       System.out.println("good_name:"+good_name);
     System.out.println("good_url:"+good_url);
     try {
             //  (good_id, good_name, good_url)
     ps.setString(1,CommonUtil.getUUID32());
     ps.setString(2,good_name);
     ps.setString(3,good_url);
     ps.executeUpdate();
     } catch (Exception e1) {
     // TODO Auto-generated catch block
     e1.printStackTrace();
     }
     }
    
        }
     System.out.println("第"+i+"页");
        }
       
       
      

      
       
}
}
相关标签: html jsoup