欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

java实现登录之后抓取数据

程序员文章站 2024-02-15 14:45:10
最近做了一个从网络上抓取数据的一个小程序。主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中。 也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里...

最近做了一个从网络上抓取数据的小程序。主要是信贷方面的:收集了一些黑名单网站,把这些网站上的数据抓取到自己的系统中。

也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。

首先需要一个jsoup的jar包,我用的是1.6.0。下载地址为:http://pan.baidu.com/s/1mgqouha

1,获取网页内容(核心代码,技术有限没封装)。

2,登录之后抓取网页数据(如何在请求中携带cookie)。

3,获取网站的ajax请求方法(返回json)。

以上这三点我就用一个类全部包含(比较粗糙,望见谅;直接copy代码过去,应该就可以用)。

一,这个类分别包含上面1、2、3三种方法,直接运行main方法就可以进行测试

package com.minxinloan.black.web.utils;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class cookieutil {

  public final static string content_type = "content-type";

  public static void main(string[] args) {
    
    //string loginurl = "http://www.p2peye.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=lsc66&username=puqiuxiaomao&password=a1234567";
    string listurl = "http://www.p2peye.com/blacklist.php?p=2";
    string logurl = "http://www.p2peye.com/member.php";


    //********************************需要登录的*************************************************
    try {
        connection.response res = 
            jsoup.connect(logurl)
              .data("mod","logging"
                  ,"action","login"
                  ,"loginsubmit","yes"
                  ,"loginhash","lsc66"
                  ,"username","puqiuxiaomao"
                  ,"password","a1234567")
              .method(method.post)
              .execute();
        
        
        //这儿的sessionid需要根据要登录的目标网站设置的session cookie名字而定
        connection con=jsoup.connect(listurl);
        //设置访问形式(电脑访问,手机访问):直接百度都参数设置
        con.header("user-agent", "mozilla/4.0 (compatible; msie 7.0; windows nt 5.1)");
        //把登录信息的cookies保存如map对象里面
        map <string,string> map=res.cookies();
        iterator<entry<string,string>> it =map.entryset().iterator();
        while(it.hasnext()){
          entry<string,string> en= it.next(); 
          //把登录的信息放入请求里面
          con =con.cookie(en.getkey(), en.getvalue());
          
        }
        //再次获取document对象。
        document objectdoc = con.get();
        
        elements elements = objectdoc.getallelements();//获取这个连接返回页面的源码内容(不是源码跟源码差不多)
        for (element element : elements) {
          //element是迭代出来的标签:如:<div><span></span></div>
          elements elements2= element.getallelements();//
           for (element element2 : elements2) {
             element2.text();
             element2.attr("href");//获取标签属性。element2代表a标签:href代表属性
             element2.text();//获取标签文本
          }
        }
        
        //********************************不需要登录的*************************************************
        
        string url = "http://www.p2peye.com/blacklist.php?p=2";
        document contemp = jsoup.connect(url).get();
        elements elementstemps = contemp.getallelements();
         for (element elementstemp : elementstemps) {
           elementstemp.text();
           elementstemp.attr("href");//获取标签属性。element2代表a标签:href代表属性
           elementstemp.text();//获取标签文本
        }
        
        
        //********************************ajax方法获取内容。。。*************************************************。
         httpurlconnection connection = null;
          bufferedreader reader = null;
          try {
            stringbuffer sb = new stringbuffer();
            url geturl = new url(url);
            connection = (httpurlconnection)geturl.openconnection();
            reader = new bufferedreader(new inputstreamreader(
                connection.getinputstream(),"utf-8"));
            string lines;
            while ((lines = reader.readline()) != null) {
              sb.append(lines);
            };
            list<map<string, object>> list = parsejson2list(sb.tostring());//json转换成list
          } catch (exception e) {
            
          } finally{
            if(reader!=null)
              try {
                reader.close();
              } catch (ioexception e) {
              }
            // 断开连接
            connection.disconnect();
          }
        
    } catch (ioexception e) {
      // todo auto-generated catch block
      e.printstacktrace();
    }
    
  }
  

  public static map<string, object> parsejson2map(string jsonstr){ 
    map<string, object> map = new hashmap<string, object>(); 
    //最外层解析 
    jsonobject json = jsonobject.fromobject(jsonstr); 
    for(object k : json.keyset()){ 
      object v = json.get(k);  
      //如果内层还是数组的话,继续解析 
      if(v instanceof jsonarray){ 
        list<map<string, object>> list = new arraylist<map<string,object>>(); 
        iterator<jsonobject> it = ((jsonarray)v).iterator(); 
        while(it.hasnext()){ 
          jsonobject json2 = it.next(); 
          list.add(parsejson2map(json2.tostring())); 
        } 
        map.put(k.tostring(), list); 
      } else { 
        map.put(k.tostring(), v); 
      } 
    } 
    return map; 
  } 
  
  public static list<map<string, object>> parsejson2list(string jsonstr){ 
    jsonarray jsonarr = jsonarray.fromobject(jsonstr); 
    list<map<string, object>> list = new arraylist<map<string,object>>(); 
    iterator<jsonobject> it = jsonarr.iterator(); 
    while(it.hasnext()){ 
      jsonobject json2 = it.next(); 
      list.add(parsejson2map(json2.tostring())); 
    } 
    return list; 
  } 
  
  

}

二,这个是获取验证码的类,可以研究下。(但你需要先分析出网站验证码的请求地址)

package com.minxinloan.black.web.utils;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * HTTP helpers used to download a captcha image and to read simple pages.
 */
public class utils {

  /**
   * Requests {@code surl} and, unless only headers are wanted, saves the response body (the
   * captcha image) to {@code path + "/code.bmp"}.
   *
   * <p>Redirects are not followed, and connect/read timeouts are 5 seconds. The returned
   * {@link content} carries the URL, an empty body (the payload goes to the file, not into
   * memory) and the response headers.
   *
   * <p>NOTE(review): {@code parammap} and {@code requestheadermap} are accepted but never sent
   * with the request; confirm the intended wire format before wiring them in.
   *
   * @param method HTTP verb, {@code "get"} or {@code "post"} in any letter case; anything else,
   *     including {@code null}, falls back to POST
   * @param surl URL to request
   * @param parammap map holding user name and password (currently unused)
   * @param requestheadermap map holding cookies (currently unused)
   * @param isonlyreturnheader when {@code true} the body is not downloaded
   * @param path directory in which {@code code.bmp} is written
   * @return the response description, or {@code null} when the request fails or the status is
   *     neither 200 nor 201
   */
  public static content getrandom(String method, String surl,
      Map<String, String> parammap,
      Map<String, String> requestheadermap,
      boolean isonlyreturnheader, String path) {

    HttpURLConnection httpurlconnection = null;
    try {
      URL url = new URL(surl);
      // setRequestMethod only accepts the upper-case verb names, so normalise here.
      String verb = "get".equalsIgnoreCase(method) ? "GET" : "POST";

      httpurlconnection = (HttpURLConnection) url.openConnection();
      httpurlconnection.setRequestMethod(verb);
      httpurlconnection.setRequestProperty("accept-language",
          "zh-cn,zh;q=0.5");
      // Do not follow redirects; the caller handles them.
      httpurlconnection.setInstanceFollowRedirects(false);
      httpurlconnection.setDoOutput(true);
      httpurlconnection.setDoInput(true);
      httpurlconnection.setConnectTimeout(5000);
      httpurlconnection.setReadTimeout(5000);
      httpurlconnection.setUseCaches(false);
      httpurlconnection.setDefaultUseCaches(false);
      httpurlconnection.connect();

      int responsecode = httpurlconnection.getResponseCode();
      if (responsecode != HttpURLConnection.HTTP_OK
          && responsecode != HttpURLConnection.HTTP_CREATED) {
        return null;
      }

      if (!isonlyreturnheader) {
        // Stream the captcha image to disk; both streams are closed even if copying fails.
        InputStream ins = httpurlconnection.getInputStream();
        try {
          FileOutputStream out = new FileOutputStream(path + "/code.bmp");
          try {
            byte[] buffer = new byte[4096];
            int count;
            while ((count = ins.read(buffer)) != -1) {
              out.write(buffer, 0, count);
            }
          } finally {
            out.close();
          }
        } finally {
          ins.close();
        }
      }

      String encoding = getencodingfromcontenttype(
          httpurlconnection.getHeaderField("Content-Type"));
      if (encoding == null) {
        // No Content-Type header at all: decode with the same default used elsewhere.
        encoding = "utf-8";
      }
      return new content(surl, new String(new byte[0], encoding),
          httpurlconnection.getHeaderFields());
    } catch (Exception e) {
      // Callers treat null as "request failed"; report the cause instead of hiding it.
      e.printStackTrace();
      return null;
    } finally {
      if (httpurlconnection != null) {
        httpurlconnection.disconnect();
      }
    }
  }

  /**
   * Extracts the charset from a Content-Type header value such as
   * {@code text/html; charset="utf-8"}.
   *
   * <p>The first {@code ;}-separated token (the media type) is skipped. Among the remaining
   * parameters the last {@code charset} whose value names a supported charset wins; the value is
   * returned as written, with one pair of surrounding double quotes removed.
   *
   * @param contenttype header value, may be {@code null}
   * @return {@code null} when {@code contenttype} is {@code null}; otherwise the charset name,
   *     or {@code "utf-8"} when none is given, it is malformed, or it is not supported
   */
  public static String getencodingfromcontenttype(String contenttype) {
    if (contenttype == null) {
      return null;
    }
    String encoding = null;
    StringTokenizer tok = new StringTokenizer(contenttype, ";");
    if (tok.hasMoreTokens()) {
      tok.nextToken(); // skip the media type
    }
    while (tok.hasMoreTokens()) {
      String assignment = tok.nextToken().trim();
      int eqidx = assignment.indexOf('=');
      if (eqidx == -1) {
        continue;
      }
      String varname = assignment.substring(0, eqidx).trim();
      if (!"charset".equalsIgnoreCase(varname)) {
        continue;
      }
      String varvalue = assignment.substring(eqidx + 1).trim();
      // Require two characters so that a lone quote is not taken for a quoted value.
      if (varvalue.length() >= 2
          && varvalue.startsWith("\"")
          && varvalue.endsWith("\"")) {
        varvalue = varvalue.substring(1, varvalue.length() - 1);
      }
      if (issupportedcharset(varvalue)) {
        encoding = varvalue;
      }
    }
    if (encoding == null) {
      return "utf-8";
    }
    return encoding;
  }

  /** Returns whether {@code name} is a legal and supported charset name, without throwing. */
  private static boolean issupportedcharset(String name) {
    try {
      return Charset.isSupported(name);
    } catch (IllegalArgumentException e) {
      // Charset.isSupported rejects empty or syntactically illegal names by throwing.
      return false;
    }
  }

  /**
   * Writes {@code content} to the file at {@code path}, replacing any previous content.
   *
   * @param content text to write
   * @param path target file path
   * @return {@code true} if the text was written without error, {@code false} otherwise
   */
  public static boolean infile(String content, String path) {
    PrintWriter out = null;
    File file = new File(path);
    try {
      if (!file.exists()) {
        file.createNewFile();
      }
      out = new PrintWriter(new FileWriter(file));

      out.write(content);
      // PrintWriter never throws on write errors; checkError flushes and reports them.
      return !out.checkError();
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      // out is still null when the file could not be opened.
      if (out != null) {
        out.close();
      }
    }
    return false;
  }

  /**
   * Downloads {@code httpurl} and returns its body decoded as UTF-8, every line terminated by
   * {@code "\n"}. The HTTP status code is printed to standard output.
   *
   * @param httpurl URL to read
   * @return the page text, or an empty string when the request fails
   */
  public static String gethtmlreadline(String httpurl) {
    try {
      URL url = new URL(httpurl);
      HttpURLConnection connection = (HttpURLConnection) url.openConnection();
      try {
        connection.connect();
        System.out.println(connection.getResponseCode());

        BufferedReader reader = new BufferedReader(
            new InputStreamReader(connection.getInputStream(), "utf-8"));
        try {
          StringBuilder text = new StringBuilder();
          String currentline;
          while ((currentline = reader.readLine()) != null) {
            text.append(currentline).append("\n");
          }
          return text.toString();
        } finally {
          reader.close();
        }
      } finally {
        connection.disconnect();
      }
    } catch (Exception e) {
      // Best effort: a failed download yields an empty page, but the cause is reported.
      e.printStackTrace();
      return "";
    }
  }
}


/**
 * Immutable holder for the outcome of an HTTP request: the requested URL, the decoded body and
 * the response headers.
 */
class content {
  private final String url;
  private final String body;
  private final Map<String, List<String>> headers;

  /**
   * @param url requested URL
   * @param body decoded response body
   * @param headers response headers, stored as given (not copied)
   */
  public content(String url, String body, Map<String, List<String>> headers) {
    this.url = url;
    this.body = body;
    this.headers = headers;
  }

  /** Returns the requested URL. */
  public String geturl() {
    return url;
  }

  /** Returns the decoded response body. */
  public String getbody() {
    return body;
  }

  /** Returns the response headers passed to the constructor. */
  public Map<String, List<String>> getheaders() {
    return headers;
  }

}