java多线程抓取铃声多多官网的铃声数据
程序员文章站
2024-03-11 18:34:13
一直想练习下java多线程抓取数据。
有天被我发现,铃声多多的官网(http://www.shoujiduoduo.com/main/)有大量的数据。
通过观察他们前端...
一直想练习下java多线程抓取数据。
有天被我发现,铃声多多的官网(http://www.shoujiduoduo.com/main/)有大量的数据。
通过观察他们前端获取铃声数据的 ajax 请求,可以看到接口地址如下:
http://www.shoujiduoduo.com/ringweb/ringweb.php?type=getlist&listid={类别id}&page={分页页码}
很容易就能发现,只要改变 listid 和 page 这两个参数,就能从服务器获取对应类别、对应页码的铃声 json 数据。
解析返回的 json 可以看到,每页数据都带有 {"hasmore":1,"curpage":1} 这样的分页指示,通过判断 hasmore 的值(为 1 时继续),决定是否进行下一页的抓取。
但是通过上面这个链接返回的json中不带有铃声的下载地址
很快就可以发现,点击页面上的“下载”按钮时,前端还会发出另一个请求。
通过下面的请求,就可以获取铃声的下载地址了
http://www.shoujiduoduo.com/ringweb/ringweb.php?type=geturl&act=down&rid={铃声id}
所以,他们的数据是很容易被偷的。于是我就开始...
源码已经发在github上。如果感兴趣的童鞋可以查看
github:https://github.com/yongbo000/duoduoaudiorobot
上代码:
package me.yongbo.duoduoringrobot;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

/**
 * Crawls one ringtone category ("list") page by page and stores every usable ring.
 *
 * <p>Each instance works on a single {@code listId} and is meant to be run on its own
 * thread. For every page it requests the list JSON, reads the paging header
 * ({@code hasmore} / {@code curpage}), resolves each ring's download URL with a second
 * request and hands the ring to {@link #writeToDatabase(Ring)}.
 *
 * <p>NOTE(review): this listing was recovered from a copy whose identifiers had been
 * lower-cased. Casing of JDK and Gson names is certain; casing of project names
 * ({@code Ring}, {@code DbHelper} and their methods) follows Java conventions and should
 * be confirmed against those classes. String literals are kept exactly as they appeared.
 *
 * @author yongbo_
 * @created 2013/4/16
 */
public class DuoduoRingRobotClient implements Runnable {

    public static String GET_RINGINFO_URL = "http://www.shoujiduoduo.com/ringweb/ringweb.php?type=getlist&listid=%1$d&page=%2$d";
    // %1$s (not %1$d): the ring id is passed as a String, and %d rejects Strings at runtime.
    public static String GET_DOWN_URL = "http://www.shoujiduoduo.com/ringweb/ringweb.php?type=geturl&act=down&rid=%1$s";
    public static String ERROR_MSG = "listid为 %1$d 的robot发生错误,已自动停止。当前page为 %2$d";
    public static String STATUS_MSG = "开始抓取数据,当前listid: %1$d,当前page: %2$d";

    public static String FILE_DIR = "e:/ringdata/";
    public static String FILE_NAME = "listid=%1$d.txt";

    /** Extracts {@code hasmore} (group 1) and {@code curpage} (group 2) from a list response. */
    private static final Pattern PAGING_PATTERN =
            Pattern.compile("\"hasmore\":([0-9]*),\"curpage\":([0-9]*)");

    /** A positive integer without leading zeros; used to reject fractional durations. */
    private static final Pattern WHOLE_NUMBER_PATTERN = Pattern.compile("^[1-9][0-9]*$");

    private boolean errorFlag = false;
    private final int listId;
    private int page;
    private final int endPage;
    private int hasMore = 1;
    private final DbHelper dbHelper;

    /**
     * Creates a robot that crawls a bounded page range.
     *
     * @param listId    category (menu) id
     * @param beginPage first page to request
     * @param endPage   last page to request, or {@code -1} for no upper bound
     */
    public DuoduoRingRobotClient(int listId, int beginPage, int endPage) {
        this.listId = listId;
        this.page = beginPage;
        this.endPage = endPage;
        this.dbHelper = new DbHelper();
    }

    /**
     * Creates a robot that crawls from {@code page} until the server reports no more data.
     *
     * @param listId category (menu) id
     * @param page   first page to request
     */
    public DuoduoRingRobotClient(int listId, int page) {
        this(listId, page, -1);
    }

    /**
     * Fetches the current page of the list, updates the paging state and stores its rings.
     *
     * <p>If the request failed or returned nothing, no parsing is attempted: the loop in
     * {@link #run()} then ends through {@code errorFlag} or {@code hasMore}.
     */
    public void getRings() {
        String url = String.format(GET_RINGINFO_URL, listId, page);
        String responseStr = httpGet(url);
        if (errorFlag) {
            // httpGet already logged the failure; an empty body is not parseable JSON.
            return;
        }
        if (responseStr.length() == 0) {
            hasMore = 0;
            return;
        }
        hasMore = getHasMore(responseStr);
        // NOTE(review): page is taken straight from "curpage". This only advances if the
        // server reports the page to request next there - confirm against a live response.
        page = getNextPage(responseStr);
        // Drop the paging header object and the trailing comma so a plain JSON array remains.
        ringParse(responseStr
                .replaceAll("\\{\"hasmore\":[0-9]*,\"curpage\":[0-9]*\\},", "")
                .replaceAll(",]", "]"));
    }

    /**
     * Performs an HTTP GET and returns the response body with line breaks removed.
     *
     * <p>On any failure the robot is flagged as failed, {@link #ERROR_MSG} is appended to
     * the list's text file and an empty string is returned.
     *
     * @param webUrl address to request
     * @return the response body, or {@code ""} on failure
     */
    public String httpGet(String webUrl) {
        StringBuilder sb = new StringBuilder();
        BufferedReader bufReader = null;
        try {
            URL url = new URL(webUrl);
            URLConnection conn = url.openConnection();
            conn.connect();
            InputStream is = conn.getInputStream();
            // NOTE(review): decodes with the platform default charset, as before; confirm
            // the server's encoding and pass it explicitly if names come out garbled.
            bufReader = new BufferedReader(new InputStreamReader(is));
            String lineText;
            while ((lineText = bufReader.readLine()) != null) {
                sb.append(lineText);
            }
            return sb.toString();
        } catch (Exception e) {
            errorFlag = true;
            // Record the failure in the list's text file.
            writeToFile(String.format(ERROR_MSG, listId, page));
            return "";
        } finally {
            // Closing the reader also closes the underlying connection stream.
            if (bufReader != null) {
                try {
                    bufReader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close fails; the body was already read.
                }
            }
        }
    }

    /**
     * Converts a JSON array of rings into {@code Ring} objects and stores the usable ones.
     *
     * <p>Stops early as soon as the robot is flagged as failed (for example when a
     * download-URL request fails).
     *
     * @param json JSON text expected to be an array of ring objects
     */
    public void ringParse(String json) {
        JsonElement element = new JsonParser().parse(json);
        if (!element.isJsonArray()) {
            // Unexpected payload: stop this robot instead of dying with IllegalStateException.
            errorFlag = true;
            writeToFile(String.format(ERROR_MSG, listId, page));
            return;
        }
        JsonArray array = element.getAsJsonArray();
        Gson gson = new Gson();
        for (Iterator<JsonElement> it = array.iterator(); it.hasNext() && !errorFlag; ) {
            JsonElement e = it.next();
            // Map the JSON object onto the Ring bean, then resolve its download URL.
            Ring ring = gson.fromJson(e, Ring.class);
            ring.setDownUrl(getRingDownUrl(ring.getId()));
            if (isAvailableRing(ring)) {
                System.out.println(ring.toString());
                // Storage target is a choice: database (below) or writeToFile(ring.toString()).
                writeToDatabase(ring);
            }
        }
    }

    /**
     * Appends one line to this list's text file under {@link #FILE_DIR}, creating the
     * directory and file when needed. I/O failures are printed and otherwise ignored.
     *
     * @param data text to append; a CRLF is written after it
     */
    public void writeToFile(String data) {
        String path = FILE_DIR + String.format(FILE_NAME, listId);
        File dir = new File(FILE_DIR);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        FileWriter fw = null;
        try {
            // Append mode; FileWriter creates the file itself if it does not exist yet.
            fw = new FileWriter(new File(path), true);
            fw.write(data);
            fw.write("\r\n");
            fw.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (fw != null) {
                try {
                    fw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Stores one ring through the database helper.
     *
     * @param ring ring to store
     */
    public void writeToDatabase(Ring ring) {
        dbHelper.execute("addring", ring);
    }

    /**
     * Crawls pages until the server reports no more data, the configured end page is
     * passed, or an error stops the robot.
     */
    @Override
    public void run() {
        while (hasMore == 1 && !errorFlag) {
            if (endPage != -1 && page > endPage) {
                break;
            }
            System.out.println(String.format(STATUS_MSG, listId, page));
            getRings();
            if (!errorFlag) {
                System.out.println("该页数据写入完成");
            }
        }
        System.out.println("ending...");
    }

    /**
     * Reads the {@code hasmore} value from a list response.
     *
     * @param resultStr raw list response
     * @return the parsed value, or {@code 0} when the paging header is absent
     */
    private int getHasMore(String resultStr) {
        Matcher match = PAGING_PATTERN.matcher(resultStr);
        if (match.find()) {
            return Integer.parseInt(match.group(1));
        }
        return 0;
    }

    /**
     * Reads the {@code curpage} value from a list response.
     *
     * @param resultStr raw list response
     * @return the parsed value, or {@code 0} when the paging header is absent
     */
    private int getNextPage(String resultStr) {
        Matcher match = PAGING_PATTERN.matcher(resultStr);
        if (match.find()) {
            return Integer.parseInt(match.group(2));
        }
        return 0;
    }

    /**
     * Decides whether a ring should be stored.
     *
     * <p>A ring is rejected when any of duration, name, artist or download URL is missing,
     * when the duration is not a positive whole number, when name or artist is longer than
     * 50 characters, or when the download URL is empty.
     *
     * @param ring ring to check
     * @return {@code true} if the ring passes all checks
     */
    private boolean isAvailableRing(Ring ring) {
        String duration = ring.getDuration();
        String name = ring.getName();
        String artist = ring.getArtist();
        String downUrl = ring.getDownUrl();
        // Gson leaves fields null when the JSON omits them; treat such rings as unusable.
        if (duration == null || name == null || artist == null || downUrl == null) {
            return false;
        }
        if (!WHOLE_NUMBER_PATTERN.matcher(duration).find()) {
            return false;
        }
        return name.length() <= 50 && artist.length() <= 50 && downUrl.length() != 0;
    }

    /**
     * Requests the download address of a ring.
     *
     * @param rid ring id
     * @return the raw response body, or {@code ""} if the request failed
     */
    public String getRingDownUrl(String rid) {
        String url = String.format(GET_DOWN_URL, rid);
        return httpGet(url);
    }
}