java实现登录之后抓取数据
程序员文章站
2024-02-15 14:45:10
最近做了一个从网络上抓取数据的一个小程序。主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中。
也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里...
最近做了一个从网络上抓取数据的一个小程序。主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中。
也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。
首先需要一个jsoup的jar包,我用的1.6.0。下载地址为:http://pan.baidu.com/s/1mgqouha
1,获取网页内容(核心代码,技术有限没封装)。
2,登录之后抓取网页数据(如何在请求中携带cookie)。
3,获取网站的ajax请求方法(返回json)。
以上这三点我就用一个类全部包含(比较糙望见谅,直接copy代码过去,应该就可以用)
一,这个类包含上面的1,2,3三种方法,直接运行main方法即可进行测试
package com.minxinloan.black.web.utils; import java.io.bufferedreader; import java.io.bytearrayoutputstream; import java.io.datainputstream; import java.io.dataoutputstream; import java.io.file; import java.io.fileoutputstream; import java.io.filewriter; import java.io.ioexception; import java.io.inputstream; import java.io.inputstreamreader; import java.io.outputstream; import java.io.printwriter; import java.net.httpurlconnection; import java.net.url; import java.net.urlconnection; import java.net.urlencoder; import java.nio.charset.charset; import java.util.arraylist; import java.util.hashmap; import java.util.iterator; import java.util.list; import java.util.map; import java.util.map.entry; import java.util.stringtokenizer; import net.sf.json.jsonarray; import net.sf.json.jsonobject; import org.jsoup.connection; import org.jsoup.connection.method; import org.jsoup.jsoup; import org.jsoup.nodes.document; import org.jsoup.nodes.element; import org.jsoup.select.elements; public class cookieutil { public final static string content_type = "content-type"; public static void main(string[] args) { //string loginurl = "http://www.p2peye.com/member.php?mod=logging&action=login&loginsubmit=yes&loginhash=lsc66&username=puqiuxiaomao&password=a1234567"; string listurl = "http://www.p2peye.com/blacklist.php?p=2"; string logurl = "http://www.p2peye.com/member.php"; //********************************需要登录的************************************************* try { connection.response res = jsoup.connect(logurl) .data("mod","logging" ,"action","login" ,"loginsubmit","yes" ,"loginhash","lsc66" ,"username","puqiuxiaomao" ,"password","a1234567") .method(method.post) .execute(); //这儿的sessionid需要根据要登录的目标网站设置的session cookie名字而定 connection con=jsoup.connect(listurl); //设置访问形式(电脑访问,手机访问):直接百度都参数设置 con.header("user-agent", "mozilla/4.0 (compatible; msie 7.0; windows nt 5.1)"); //把登录信息的cookies保存如map对象里面 map <string,string> map=res.cookies(); iterator<entry<string,string>> it 
=map.entryset().iterator(); while(it.hasnext()){ entry<string,string> en= it.next(); //把登录的信息放入请求里面 con =con.cookie(en.getkey(), en.getvalue()); } //再次获取document对象。 document objectdoc = con.get(); elements elements = objectdoc.getallelements();//获取这个连接返回页面的源码内容(不是源码跟源码差不多) for (element element : elements) { //element是迭代出来的标签:如:<div><span></span></div> elements elements2= element.getallelements();// for (element element2 : elements2) { element2.text(); element2.attr("href");//获取标签属性。element2代表a标签:href代表属性 element2.text();//获取标签文本 } } //********************************不需要登录的************************************************* string url = "http://www.p2peye.com/blacklist.php?p=2"; document contemp = jsoup.connect(url).get(); elements elementstemps = contemp.getallelements(); for (element elementstemp : elementstemps) { elementstemp.text(); elementstemp.attr("href");//获取标签属性。element2代表a标签:href代表属性 elementstemp.text();//获取标签文本 } //********************************ajax方法获取内容。。。*************************************************。 httpurlconnection connection = null; bufferedreader reader = null; try { stringbuffer sb = new stringbuffer(); url geturl = new url(url); connection = (httpurlconnection)geturl.openconnection(); reader = new bufferedreader(new inputstreamreader( connection.getinputstream(),"utf-8")); string lines; while ((lines = reader.readline()) != null) { sb.append(lines); }; list<map<string, object>> list = parsejson2list(sb.tostring());//json转换成list } catch (exception e) { } finally{ if(reader!=null) try { reader.close(); } catch (ioexception e) { } // 断开连接 connection.disconnect(); } } catch (ioexception e) { // todo auto-generated catch block e.printstacktrace(); } } public static map<string, object> parsejson2map(string jsonstr){ map<string, object> map = new hashmap<string, object>(); //最外层解析 jsonobject json = jsonobject.fromobject(jsonstr); for(object k : json.keyset()){ object v = json.get(k); //如果内层还是数组的话,继续解析 if(v instanceof jsonarray){ list<map<string, 
object>> list = new arraylist<map<string,object>>(); iterator<jsonobject> it = ((jsonarray)v).iterator(); while(it.hasnext()){ jsonobject json2 = it.next(); list.add(parsejson2map(json2.tostring())); } map.put(k.tostring(), list); } else { map.put(k.tostring(), v); } } return map; } public static list<map<string, object>> parsejson2list(string jsonstr){ jsonarray jsonarr = jsonarray.fromobject(jsonstr); list<map<string, object>> list = new arraylist<map<string,object>>(); iterator<jsonobject> it = jsonarr.iterator(); while(it.hasnext()){ jsonobject json2 = it.next(); list.add(parsejson2map(json2.tostring())); } return list; } }
二,这个是获取验证码的类,可以研究下。(但你需要分析出网站的验证码的请求地址)
package com.minxinloan.black.web.utils; import java.io.bufferedreader; import java.io.datainputstream; import java.io.dataoutputstream; import java.io.file; import java.io.fileoutputstream; import java.io.filewriter; import java.io.inputstream; import java.io.inputstreamreader; import java.io.printwriter; import java.net.httpurlconnection; import java.net.url; import java.net.urlconnection; import java.nio.charset.charset; import java.util.hashmap; import java.util.list; import java.util.map; import java.util.stringtokenizer; public class utils {//解析验证码的 public static content getrandom(string method, string surl,// 要解析的url map<string, string> parammap, // 存放用户名和密码的map map<string, string> requestheadermap,// 存放cookie的map boolean isonlyreturnheader, string path) { content content = null; httpurlconnection httpurlconnection = null; inputstream in = null; try { url url = new url(surl); boolean ispost = "post".equals(method); if (method == null || (!"get".equalsignorecase(method) && !"post" .equalsignorecase(method))) { method = "post"; } url resolvedurl = url; urlconnection urlconnection = resolvedurl.openconnection(); httpurlconnection = (httpurlconnection) urlconnection; httpurlconnection.setrequestmethod(method); httpurlconnection.setrequestproperty("accept-language", "zh-cn,zh;q=0.5"); // do not follow redirects, we will handle redirects ourself httpurlconnection.setinstancefollowredirects(false); httpurlconnection.setdooutput(true); httpurlconnection.setdoinput(true); httpurlconnection.setconnecttimeout(5000); httpurlconnection.setreadtimeout(5000); httpurlconnection.setusecaches(false); httpurlconnection.setdefaultusecaches(false); httpurlconnection.connect(); int responsecode = httpurlconnection.getresponsecode(); if (responsecode == httpurlconnection.http_ok || responsecode == httpurlconnection.http_created) { byte[] bytes = new byte[0]; if (!isonlyreturnheader) { datainputstream ins = new datainputstream( httpurlconnection.getinputstream()); // 验证码的位置 
dataoutputstream out = new dataoutputstream( new fileoutputstream(path + "/code.bmp")); byte[] buffer = new byte[4096]; int count = 0; while ((count = ins.read(buffer)) > 0) { out.write(buffer, 0, count); } out.close(); ins.close(); } string encoding = null; if (encoding == null) { encoding = getencodingfromcontenttype(httpurlconnection .getheaderfield("")); } content = new content(surl, new string(bytes, encoding), httpurlconnection.getheaderfields()); } } catch (exception e) { return null; } finally { if (httpurlconnection != null) { httpurlconnection.disconnect(); } } return content; } public static string getencodingfromcontenttype(string contenttype) { string encoding = null; if (contenttype == null) { return null; } stringtokenizer tok = new stringtokenizer(contenttype, ";"); if (tok.hasmoretokens()) { tok.nexttoken(); while (tok.hasmoretokens()) { string assignment = tok.nexttoken().trim(); int eqidx = assignment.indexof('='); if (eqidx != -1) { string varname = assignment.substring(0, eqidx).trim(); if ("charset".equalsignorecase(varname)) { string varvalue = assignment.substring(eqidx + 1) .trim(); if (varvalue.startswith("\"") && varvalue.endswith("\"")) { // substring works on indices varvalue = varvalue.substring(1, varvalue.length() - 1); } if (charset.issupported(varvalue)) { encoding = varvalue; } } } } } if (encoding == null) { return "utf-8"; } return encoding; } // 这个是输出 public static boolean infile(string content, string path) { printwriter out = null; file file = new file(path); try { if (!file.exists()) { file.createnewfile(); } out = new printwriter(new filewriter(file)); out.write(content); out.flush(); return true; } catch (exception e) { e.printstacktrace(); } finally { out.close(); } return false; } public static string gethtmlreadline(string httpurl) { string currentline = ""; string totalstring = ""; inputstream urlstream; string content = ""; try { url url = new url(httpurl); httpurlconnection connection = (httpurlconnection) url 
.openconnection(); connection.connect(); system.out.println(connection.getresponsecode()); urlstream = connection.getinputstream(); bufferedreader reader = new bufferedreader( new inputstreamreader(urlstream, "utf-8")); while ((currentline = reader.readline()) != null) { totalstring += currentline + "\n"; } content = totalstring; } catch (exception e) { } return content; } } class content { private string url; private string body; private map<string, list<string>> m_mheaders = new hashmap<string, list<string>>(); public content(string url, string body, map<string, list<string>> headers) { this.url = url; this.body = body; this.m_mheaders = headers; } public string geturl() { return url; } public string getbody() { return body; } public map<string, list<string>> getheaders() { return m_mheaders; } }